Skip to content

Commit

Permalink
fix: support negative index, multislice in tidyselect (#430)
Browse files Browse the repository at this point in the history
* fix: support negative index, multislice in tidyselect

* fix: tidyselect add VarAnd type

* refactor: prep tidyselect.py file

* refactor: move to tidyselect.py

* fix: grouped nest with new tidyselect

* fix: tidyselect Var should accept a Call
  • Loading branch information
machow authored Aug 26, 2022
1 parent 84ab241 commit 0d48e89
Show file tree
Hide file tree
Showing 5 changed files with 251 additions and 162 deletions.
40 changes: 33 additions & 7 deletions examples/examples-varspec.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"metadata": {},
"outputs": [],
"source": [
"from siuba.dply.verbs import Var, VarList, var_select\n",
"from siuba.dply.tidyselect import Var, VarList, var_select\n",
"from siuba import _\n",
"import pandas as pd\n",
"from pandas import DataFrame, Series"
Expand Down Expand Up @@ -40,8 +40,8 @@
"colnames = ['a', 'b', 'c', 'd']\n",
"var_select(\n",
" colnames,\n",
" -v.d, \n",
" v.x == v.a,\n",
" -v.d,\n",
" )"
]
},
Expand Down Expand Up @@ -108,6 +108,32 @@
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OrderedDict([('home_phone', None),\n",
" ('home_address', None),\n",
" ('other', None),\n",
" ('misc', None)])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"var_select(\n",
" colnames,\n",
" v[:]\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -117,16 +143,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Var('_.startswith('a')', negated = False, alias = None)"
"Var(_.startswith('a'), negated=False, alias=None)"
]
},
"execution_count": 5,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -139,7 +165,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -153,7 +179,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
"version": "3.8.12"
},
"toc": {
"base_numbering": 1,
Expand Down
199 changes: 199 additions & 0 deletions siuba/dply/tidyselect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import pandas as pd

from siuba.siu import Call, MetaArg, BinaryOp
from collections import OrderedDict
from itertools import chain

class Var:
    """A single column selector.

    Instances carry three attributes, set directly from the constructor:
    ``name`` (what to select), ``negated`` (True when the selection is an
    exclusion) and ``alias`` (the new name for the column, or None).

    Because ``__eq__`` is overridden to build a rename rather than compare,
    Python leaves instances of this class unhashable.
    """

    def __init__(self, name: "str | int | slice | Call", negated = False, alias = None):
        """Store the selector.

        Raises:
            TypeError: if ``name`` is not a str, int, slice or Call.
        """
        if not isinstance(name, (str, int, slice, Call)):
            raise TypeError(f"Var name cannot be type: {type(name)}.")
        self.name = name
        self.negated = negated
        self.alias = alias

    def __neg__(self):
        """Return a copy with the ``negated`` flag flipped (``-var``)."""
        return self.to_copy(negated = not self.negated)

    def __invert__(self):
        """Return a copy with the ``negated`` flag flipped (``~var``)."""
        return self.to_copy(negated = not self.negated)

    def __eq__(self, x):
        """Return a renaming copy: ``left == right`` selects right as left.

        The name of ``self`` becomes the alias, the name of ``x`` (or ``x``
        itself when it is not a Var) becomes the selected name, and any
        negation is dropped.
        """
        name = x.name if isinstance(x, Var) else x
        return self.to_copy(name = name, negated = False, alias = self.name)

    def __call__(self, *args, **kwargs):
        """Return a copy whose name is a Call built from this Var's name.

        The Call wraps a ``__getattr__`` of ``self.name`` on ``MetaArg("_")``
        and forwards ``args`` / ``kwargs`` to it.
        """
        call = Call(
            '__call__',
            BinaryOp('__getattr__', MetaArg("_"), self.name),
            *args,
            **kwargs
        )

        return self.to_copy(name = call)

    def __repr__(self):
        cls_name = self.__class__.__name__
        sig = f"{repr(self.name)}, negated={self.negated}, alias={self.alias}"
        return f"{cls_name}({sig})"

    def __str__(self):
        """Render as ``[alias = ][-]name``."""
        op = "-" if self.negated else ""
        # The alias is whatever name sat on the left of ``==``, so it may be
        # an int (or another non-string); format it instead of concatenating,
        # and test against None so an alias of 0 is still shown.
        pref = f"{self.alias} = " if self.alias is not None else ""
        return f"{pref}{op}{self.name}"

    def to_copy(self, **kwargs):
        """Return a new instance of the same class with ``kwargs`` overriding attributes.

        Relies on the instance attributes having the same names as the
        constructor parameters.
        """
        return self.__class__(**{**self.__dict__, **kwargs})


class VarAnd(Var):
    """A group of Var selectors (extended slice syntax).

    ``name`` holds the member selectors rather than a single column name.
    Renaming (``==``) and calling are not supported on a group.
    """
    name: "tuple[Var]"

    def __init__(self, name: "tuple[Var]", negated=False, alias=None):
        """Store the member selectors.

        Raises:
            TypeError: if any entry of ``name`` is not a Var, or if ``alias``
                is not None.
        """
        self.name = name
        self.negated = negated

        bad_var = [x for x in name if not isinstance(x, Var)]

        # Check whether the list is non-empty. Testing any(bad_var) would
        # look at the truthiness of the offending entries and let falsy
        # ones (0, "", None) slip through unreported.
        if bad_var:
            raise TypeError(f"VarAnd expects a tuple of Var, but saw entries: {bad_var}")

        if alias is not None:
            raise TypeError("alias must be none for VarAnd (extended slice syntax)")

        self.alias = None

    def __eq__(self, x):
        raise NotImplementedError()

    def __call__(self, *args, **kwargs):
        raise NotImplementedError()

    def flatten(self) -> "tuple[Var]":
        """Return the member selectors as one flat tuple.

        When this group is negated, each member is inverted before being
        emitted; nested groups are flattened recursively.
        """
        res = []
        for var in self.name:
            neg_var = ~var if self.negated else var
            if isinstance(neg_var, VarAnd):
                res.extend(neg_var.flatten())
            else:
                res.append(neg_var)

        return tuple(res)



class VarList:
    """Factory object: attribute or item access produces column selectors."""

    def __getattr__(self, x):
        # Any attribute name becomes a selector for the column of that name.
        return Var(x)

    def __getitem__(self, x):
        """Turn ``self[x]`` into a Var, or a VarAnd when ``x`` is a tuple.

        Entries that already are Var instances are passed through untouched.
        """
        if isinstance(x, tuple):
            members = tuple(el if isinstance(el, Var) else Var(el) for el in x)
            return VarAnd(members)

        return x if isinstance(x, Var) else Var(x)


def var_slice(colnames, x):
    """Return positions in colnames corresponding to the start and stop of a slice.

    Args:
        colnames: pandas Series of column names. Name lookups use ``idxmax``,
            which returns an index label, so a default 0..n-1 index is
            assumed.
        x: a slice whose bounds may each be None, an int, a column name
            (str), or a Var wrapping a column name. ``x.step`` is ignored.

    Returns:
        A ``(start, stop)`` tuple of non-negative ints with start <= stop,
        suitable for ``range(start, stop)``.

    Name bounds behave like ``df.loc[:, "V1":"V3"]``: the stop column is
    included. A name that matches no column resolves to the first column,
    because ``idxmax`` on an all-False mask returns the first label.
    """
    # TODO: reversed slices are swapped rather than reversed, so they do not
    # include the same end points as the forward slice.
    n_cols = len(colnames)

    def _position(bound, default, offset):
        # Resolve one slice bound to an integer position. ``offset`` is 1 for
        # a named stop bound so the named column itself is included.
        if bound is None:
            return default
        if isinstance(bound, int):
            return bound
        if isinstance(bound, str):
            return (colnames == bound).idxmax() + offset
        if isinstance(bound, Var):
            return (colnames == bound.name).idxmax() + offset
        return bound

    # None is tested explicitly (rather than ``bound or default``) so that an
    # explicit stop of 0 selects nothing instead of every column.
    start_indx = _position(x.start, 0, 0)
    stop_indx = _position(x.stop, n_cols, 1)

    # Apply ordinary Python slice rules: negative bounds count from the end
    # and out-of-range bounds are clamped to the number of columns.
    start_indx, stop_indx, _ = slice(start_indx, stop_indx).indices(n_cols)

    if start_indx > stop_indx:
        return stop_indx, start_indx
    return start_indx, stop_indx

def var_put_cols(name, var, cols):
    """Apply one selector to the running selection, mutating ``cols`` in place.

    Args:
        name: a single column name, or a list of column names.
        var: selector providing ``negated`` and ``alias``.
        cols: mapping of selected column name -> alias (None for no rename).

    A negated selector removes the named columns when present; otherwise each
    named column is set to the selector's alias. Assigning to a key that is
    already present keeps its existing position in the mapping.

    Raises:
        Exception: if a list of names is given together with an alias.
    """
    is_many = isinstance(name, list)
    if is_many and var.alias is not None:
        raise Exception("Cannot assign name to multiple columns")

    targets = name if is_many else [name]

    if var.negated:
        for col in targets:
            cols.pop(col, None)
    else:
        for col in targets:
            cols[col] = var.alias

def flatten_var(var):
    """Return the selectors contained in ``var`` as a sequence.

    A VarAnd is expanded through its ``flatten`` method; anything else is
    wrapped in a one-element list.
    """
    return var.flatten() if isinstance(var, VarAnd) else [var]


def var_select(colnames, *args):
    """Resolve selectors against ``colnames``.

    Args:
        colnames: column names; converted to a pandas Series when it is not
            one already.
        *args: selectors. Each may be a str, an int position, a Var, or a
            VarAnd (which is flattened into its members first).

    Returns:
        An OrderedDict in pandas.rename style, ``{"orig_name": "new_name"}``,
        where the value is None when the column is not renamed.

    Raises:
        Exception: if a selector is not a str, int or Var.
    """
    # TODO: don't erase named column if included again
    if not isinstance(colnames, pd.Series):
        colnames = pd.Series(colnames)

    selected = OrderedDict()
    flat_vars = [entry for arg in args for entry in flatten_var(arg)]

    for position, entry in enumerate(flat_vars):
        # strings are added directly
        if isinstance(entry, str):
            selected[entry] = None
            continue

        # integers add the column name at the corresponding position
        if isinstance(entry, int):
            selected[colnames.iloc[entry]] = None
            continue

        if not isinstance(entry, Var):
            raise Exception("variable must be either a string or Var instance")

        # a negation in first position starts from an implicit "everything"
        if position == 0 and entry.negated:
            selected.update((k, None) for k in colnames)

        target = entry.name
        if isinstance(target, slice):
            # slicing can refer to a single column or a range of columns
            start, stop = var_slice(colnames, target)
            for col_pos in range(start, stop):
                var_put_cols(colnames[col_pos], entry, selected)
        elif callable(target):
            # method calls like endswith(): proxied to the pandas str
            # accessor, so they must return a boolean array
            # TODO: not sure if this is a good idea...
            mask = target(colnames.str)
            var_put_cols(colnames[mask].tolist(), entry, selected)
        elif isinstance(target, int):
            var_put_cols(colnames.iloc[target], entry, selected)
        else:
            var_put_cols(target, entry, selected)

    return selected


def var_create(*args) -> "tuple[Var]":
    """Convert each argument into a selector and return them as a tuple.

    A Var is kept as is. Any other callable is invoked with a VarList and its
    result is used. Everything else is wrapped with ``Var(arg)``.

    Raises:
        ValueError: if a callable returns the VarList itself, i.e. no
            specific column was selected.
    """
    selector = VarList()

    def _as_var(arg):
        # Var instances are themselves callable, so they are checked first.
        if isinstance(arg, Var):
            return arg
        if not callable(arg):
            return Var(arg)

        result = arg(selector)
        if isinstance(result, VarList):
            raise ValueError("Must select specific column. Did you pass `_` to select?")
        return result

    return tuple(_as_var(arg) for arg in args)
Loading

0 comments on commit 0d48e89

Please sign in to comment.