pandas-dev · MarcoGorelli · Dec 2, 2022 · Dec 2, 2022 · Dec 2, 2022
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -27,6 +27,7 @@ repos:
     rev: v0.9.1
     hooks:
     -   id: cython-lint
+    -   id: double-quote-cython-strings
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.4.0
     hooks:

@@ -180,7 +180,7 @@ def is_lexsorted(list_of_arrays: list) -> bint:
     cdef int64_t **vecs = <int64_t**>malloc(nlevels * sizeof(int64_t*))
     for i in range(nlevels):
         arr = list_of_arrays[i]
-        assert arr.dtype.name == 'int64'
+        assert arr.dtype.name == "int64"
         vecs[i] = <int64_t*>cnp.PyArray_DATA(arr)
 
     # Assume uniqueness??
@@ -514,9 +514,9 @@ def validate_limit(nobs: int | None, limit=None) -> int:
         lim = nobs
     else:
         if not util.is_integer_object(limit):
-            raise ValueError('Limit must be an integer')
+            raise ValueError("Limit must be an integer")
         if limit < 1:
-            raise ValueError('Limit must be greater than 0')
+            raise ValueError("Limit must be greater than 0")
         lim = limit
 
     return lim
@@ -958,7 +958,7 @@ def rank_1d(
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
 
-    keep_na = na_option == 'keep'
+    keep_na = na_option == "keep"
 
     N = len(values)
     if labels is not None:
@@ -984,7 +984,7 @@ def rank_1d(
     # with mask, without obfuscating location of missing data
     # in values array
     if numeric_object_t is object and values.dtype != np.object_:
-        masked_vals = values.astype('O')
+        masked_vals = values.astype("O")
     else:
         masked_vals = values.copy()
 
@@ -1005,7 +1005,7 @@ def rank_1d(
     # If descending, fill with highest value since descending
     # will flip the ordering to still end up with lowest rank.
     # Symmetric logic applies to `na_option == 'bottom'`
-    nans_rank_highest = ascending ^ (na_option == 'top')
+    nans_rank_highest = ascending ^ (na_option == "top")
     nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
     if nans_rank_highest:
         order = [masked_vals, mask]
@@ -1345,7 +1345,7 @@ def rank_2d(
         if not ascending:
             tiebreak = TIEBREAK_FIRST_DESCENDING
 
-    keep_na = na_option == 'keep'
+    keep_na = na_option == "keep"
 
     # For cases where a mask is not possible, we can avoid mask checks
     check_mask = (
@@ -1362,9 +1362,9 @@ def rank_2d(
 
     if numeric_object_t is object:
         if values.dtype != np.object_:
-            values = values.astype('O')
+            values = values.astype("O")
 
-    nans_rank_highest = ascending ^ (na_option == 'top')
+    nans_rank_highest = ascending ^ (na_option == "top")
     if check_mask:
         nan_fill_val = get_rank_nan_fill_val(nans_rank_highest, <numeric_object_t>0)
 
@@ -1385,7 +1385,7 @@ def rank_2d(
         order = (values, ~np.asarray(mask))
 
     n, k = (<object>values).shape
-    out = np.empty((n, k), dtype='f8', order='F')
+    out = np.empty((n, k), dtype="f8", order="F")
     grp_sizes = np.ones(n, dtype=np.int64)
 
     # lexsort is slower, so only use if we need to worry about the mask

@@ -604,12 +604,12 @@ def group_any_all(
         intp_t lab
         int8_t flag_val, val
 
-    if val_test == 'all':
+    if val_test == "all":
         # Because the 'all' value of an empty iterable in Python is True we can
         # start with an array full of ones and set to zero when a False value
         # is encountered
         flag_val = 0
-    elif val_test == 'any':
+    elif val_test == "any":
         # Because the 'any' value of an empty iterable in Python is False we
         # can start with an array full of zeros and set to one only if any
         # value encountered is True
@@ -1061,7 +1061,7 @@ def group_ohlc(
     N, K = (<object>values).shape
 
     if out.shape[1] != 4:
-        raise ValueError('Output array must have 4 columns')
+        raise ValueError("Output array must have 4 columns")
 
     if K > 1:
         raise NotImplementedError("Argument 'values' must have only one dimension")
@@ -1157,11 +1157,11 @@ def group_quantile(
         )
 
     inter_methods = {
-        'linear': INTERPOLATION_LINEAR,
-        'lower': INTERPOLATION_LOWER,
-        'higher': INTERPOLATION_HIGHER,
-        'nearest': INTERPOLATION_NEAREST,
-        'midpoint': INTERPOLATION_MIDPOINT,
+        "linear": INTERPOLATION_LINEAR,
+        "lower": INTERPOLATION_LOWER,
+        "higher": INTERPOLATION_HIGHER,
+        "nearest": INTERPOLATION_NEAREST,
+        "midpoint": INTERPOLATION_MIDPOINT,
     }
     interp = inter_methods[interpolation]
 

@@ -184,8 +184,8 @@ cdef class IndexEngine:
         if self.is_monotonic_increasing:
             values = self.values
             try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
+                left = values.searchsorted(val, side="left")
+                right = values.searchsorted(val, side="right")
             except TypeError:
                 # e.g. GH#29189 get_loc(None) with a Float64Index
                 #  2021-09-29 Now only reached for object-dtype
@@ -353,8 +353,8 @@ cdef class IndexEngine:
             remaining_stargets = set()
             for starget in stargets:
                 try:
-                    start = values.searchsorted(starget, side='left')
-                    end = values.searchsorted(starget, side='right')
+                    start = values.searchsorted(starget, side="left")
+                    end = values.searchsorted(starget, side="right")
                 except TypeError:  # e.g. if we tried to search for string in int array
                     remaining_stargets.add(starget)
                 else:
@@ -551,7 +551,7 @@ cdef class DatetimeEngine(Int64Engine):
                 return self._get_loc_duplicates(conv)
             values = self.values
 
-            loc = values.searchsorted(conv, side='left')
+            loc = values.searchsorted(conv, side="left")
 
             if loc == len(values) or values[loc] != conv:
                 raise KeyError(val)
@@ -655,8 +655,8 @@ cdef class BaseMultiIndexCodesEngine:
         # with positive integers (-1 for NaN becomes 1). This enables us to
         # differentiate between values that are missing in other and matching
         # NaNs. We will set values that are not found to 0 later:
-        labels_arr = np.array(labels, dtype='int64').T + multiindex_nulls_shift
-        codes = labels_arr.astype('uint64', copy=False)
+        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
+        codes = labels_arr.astype("uint64", copy=False)
         self.level_has_nans = [-1 in lab for lab in labels]
 
         # Map each codes combination in the index to an integer unambiguously
@@ -693,7 +693,7 @@ cdef class BaseMultiIndexCodesEngine:
             if self.level_has_nans[i] and codes.hasnans:
                 result[codes.isna()] += 1
             level_codes.append(result)
-        return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
+        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
 
     def get_indexer(self, target: np.ndarray) -> np.ndarray:
         """
@@ -754,12 +754,12 @@ cdef class BaseMultiIndexCodesEngine:
             ndarray[int64_t, ndim=1] new_codes, new_target_codes
             ndarray[intp_t, ndim=1] sorted_indexer
 
-        target_order = np.argsort(target).astype('int64')
+        target_order = np.argsort(target).astype("int64")
         target_values = target[target_order]
         num_values, num_target_values = len(values), len(target_values)
         new_codes, new_target_codes = (
-            np.empty((num_values,)).astype('int64'),
-            np.empty((num_target_values,)).astype('int64'),
+            np.empty((num_values,)).astype("int64"),
+            np.empty((num_target_values,)).astype("int64"),
         )
 
         # `values` and `target_values` are both sorted, so we walk through them
@@ -809,7 +809,7 @@ cdef class BaseMultiIndexCodesEngine:
             raise KeyError(key)
 
         # Transform indices into single integer:
-        lab_int = self._codes_to_ints(np.array(indices, dtype='uint64'))
+        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
 
         return self._base.get_loc(self, lab_int)
 
@@ -940,8 +940,8 @@ cdef class SharedEngine:
         if self.is_monotonic_increasing:
             values = self.values
             try:
-                left = values.searchsorted(val, side='left')
-                right = values.searchsorted(val, side='right')
+                left = values.searchsorted(val, side="left")
+                right = values.searchsorted(val, side="right")
             except TypeError:
                 # e.g. GH#29189 get_loc(None) with a Float64Index
                 raise KeyError(val)

@@ -69,7 +69,7 @@ cdef class BlockPlacement:
                 or not cnp.PyArray_ISWRITEABLE(val)
                 or (<ndarray>val).descr.type_num != cnp.NPY_INTP
             ):
-                arr = np.require(val, dtype=np.intp, requirements='W')
+                arr = np.require(val, dtype=np.intp, requirements="W")
             else:
                 arr = val
             # Caller is responsible for ensuring arr.ndim == 1

@@ -42,7 +42,7 @@ from pandas._libs.tslibs.util cimport (
     is_timedelta64_object,
 )
 
-VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither'])
+VALID_CLOSED = frozenset(["left", "right", "both", "neither"])
 
 
 cdef class IntervalMixin:
@@ -59,7 +59,7 @@ cdef class IntervalMixin:
         bool
             True if the Interval is closed on the left-side.
         """
-        return self.closed in ('left', 'both')
+        return self.closed in ("left", "both")
 
     @property
     def closed_right(self):
@@ -73,7 +73,7 @@ cdef class IntervalMixin:
         bool
             True if the Interval is closed on the left-side.
         """
-        return self.closed in ('right', 'both')
+        return self.closed in ("right", "both")
 
     @property
     def open_left(self):
@@ -172,9 +172,9 @@ cdef class IntervalMixin:
         >>> pd.IntervalIndex(ivs).is_empty
         array([ True, False])
         """
-        return (self.right == self.left) & (self.closed != 'both')
+        return (self.right == self.left) & (self.closed != "both")
 
-    def _check_closed_matches(self, other, name='other'):
+    def _check_closed_matches(self, other, name="other"):
         """
         Check if the closed attribute of `other` matches.
 
@@ -197,9 +197,9 @@ cdef class IntervalMixin:
 
 
 cdef bint _interval_like(other):
-    return (hasattr(other, 'left')
-            and hasattr(other, 'right')
-            and hasattr(other, 'closed'))
+    return (hasattr(other, "left")
+            and hasattr(other, "right")
+            and hasattr(other, "closed"))
 
 
 cdef class Interval(IntervalMixin):
@@ -311,7 +311,7 @@ cdef class Interval(IntervalMixin):
     Either ``left``, ``right``, ``both`` or ``neither``.
     """
 
-    def __init__(self, left, right, str closed='right'):
+    def __init__(self, left, right, str closed="right"):
         # note: it is faster to just do these checks than to use a special
         # constructor (__cinit__/__new__) to avoid them
 
@@ -343,8 +343,8 @@ cdef class Interval(IntervalMixin):
 
     def __contains__(self, key) -> bool:
         if _interval_like(key):
-            key_closed_left = key.closed in ('left', 'both')
-            key_closed_right = key.closed in ('right', 'both')
+            key_closed_left = key.closed in ("left", "both")
+            key_closed_right = key.closed in ("right", "both")
             if self.open_left and key_closed_left:
                 left_contained = self.left < key.left
             else:
@@ -389,15 +389,15 @@ cdef class Interval(IntervalMixin):
 
         left, right = self._repr_base()
         name = type(self).__name__
-        repr_str = f'{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})'
+        repr_str = f"{name}({repr(left)}, {repr(right)}, closed={repr(self.closed)})"
         return repr_str
 
     def __str__(self) -> str:
 
         left, right = self._repr_base()
-        start_symbol = '[' if self.closed_left else '('
-        end_symbol = ']' if self.closed_right else ')'
-        return f'{start_symbol}{left}, {right}{end_symbol}'
+        start_symbol = "[" if self.closed_left else "("
+        end_symbol = "]" if self.closed_right else ")"
+        return f"{start_symbol}{left}, {right}{end_symbol}"
 
     def __add__(self, y):
         if (