Skip to content

Commit

Permalink
Merge pull request #121 from davidanthoff/simple-map-syntax
Browse files Browse the repository at this point in the history
Add a..b syntax
  • Loading branch information
davidanthoff authored Jul 22, 2017
2 parents 95d8136 + 083e115 commit d93b21f
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 4 deletions.
31 changes: 31 additions & 0 deletions docs/src/querycommands.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,37 @@ println(x)
│ 2 │ 2 │ 2 │
```

## Split-Apply-Combine (a.k.a. `dplyr`)

`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise.

#### Example

```jldoctest
using Query, DataFrames
df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]),
age=vcat([10., 20., 30.],[10., 20., 30.].+3),
children=repeat([3,2,2],inner=[1],outer=[2]),state=[:a,:a,:a,:b,:b,:b])
x = @from i in df begin
@group i by i.state into g
@select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
@collect DataFrame
end
println(x)
# output
2×4 DataFrames.DataFrame
│ Row │ group │ mage │ oldest │ youngest │
├─────┼───────┼──────┼────────┼──────────┤
│ 1 │ a │ 20.0 │ 30.0 │ 10.0 │
│ 2 │ b │ 23.0 │ 33.0 │ 13.0 │
```

## Range variables

The `@let` statement introduces new range variables in a query expression. The syntax for the range statement is `@let <range variable> = <value selector>`. `<range variable>` specifies the name of the new range variable and `<value selector>` is any Julia expression that returns the value that should be assigned to the new range variable.
Expand Down
14 changes: 14 additions & 0 deletions example/25-ab-syntax.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using Query
using DataFrames

# Sample data: three people, each listed twice (once per state `:a` and `:b`).
# The ages in the second repetition are three years higher than in the first.
df = DataFrame(
    name = repeat(["John", "Sally", "Kirk"], inner=[1], outer=[2]),
    age = vcat([10., 20., 30.], [10., 20., 30.] .+ 3),
    children = repeat([3, 2, 2], inner=[1], outer=[2]),
    state = [:a, :a, :a, :b, :b, :b])

# Group the rows by state and summarise every group in a single `@select`.
# Inside a query, `g..age` is shorthand for mapping each element of the
# grouping `g` to its `age` field, so reductions such as `mean`, `maximum`
# and `minimum` can be applied to the result directly.
x = @from i in df begin
    @group i by i.state into g
    @select {group=g.key, mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
    @collect DataFrame
end

# Show the summary table: one row per state.
println(x)
2 changes: 1 addition & 1 deletion src/Query.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ using NamedTuples
using DataStructures
using IterableTables
using DataValues
import MacroTools
using MacroTools: postwalk

import Base.start
import Base.next
Expand Down
14 changes: 12 additions & 2 deletions src/query_translation.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
function helper_namedtuples_replacement(ex)
return MacroTools.postwalk(ex) do x
return postwalk(ex) do x
if x isa Expr && x.head==:cell1d
new_ex = Expr(:macrocall, Symbol("@NT"), x.args...)

Expand All @@ -24,7 +24,7 @@ end
function helper_replace_anon_func_syntax(ex)
if !(isa(ex, Expr) && ex.head==:->)
new_symb = gensym()
new_ex = MacroTools.postwalk(ex) do x
new_ex = postwalk(ex) do x
if isa(x, Symbol) && x==:_
return new_symb
else
Expand Down Expand Up @@ -52,6 +52,16 @@ function query_expression_translation_phase_A(qe)
end
i+=1
end

for i in 1:length(qe)
qe[i] = postwalk(qe[i]) do x
if x isa Expr && x.head==:call && x.args[1]==:(..)
return :(map(i->i.$(x.args[3]),$(x.args[2])))
else
return x
end
end
end
end

function query_expression_translation_phase_B(qe)
Expand Down
4 changes: 3 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,7 @@ q = collect(Query.@select(source_df, i->get(i.children)))

include("test_indexedtables.jl")
include("test_pipesyntax.jl")
include("test_dplyr-syntax.jl")

end

Expand All @@ -576,7 +577,8 @@ end
"../example/21-nulls.jl",
"../example/22-datastreams-sink.jl",
"../example/23-dict-sink.jl",
"../example/24-DataTable.jl"]
"../example/24-DataTable.jl",
"../example/25-ab-syntax.jl"]

color = Base.have_color ? "--color=yes" : "--color=no"
compilecache = "--compilecache=" * (Bool(Base.JLOptions().use_compilecache) ? "yes" : "no")
Expand Down
27 changes: 27 additions & 0 deletions test/test_dplyr-syntax.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using Query
using DataFrames
using Base.Test



@testset "a..b Syntax (dplyr API)" begin

    # Fixture: three people, each listed twice (once per state `:a` and `:b`);
    # the ages in the second repetition are three years higher.
    df = DataFrame(
        name = repeat(["John", "Sally", "Kirk"], inner=[1], outer=[2]),
        age = vcat([10., 20., 30.], [10., 20., 30.] .+ 3),
        children = repeat([3, 2, 2], inner=[1], outer=[2]),
        state = [:a, :a, :a, :b, :b, :b])

    # Summarise the `age` column per state through the `g..age` shorthand.
    x = @from i in df begin
        @group i by i.state into g
        @select {group=g.key, mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
        @collect DataFrame
    end

    # Shape of the result: one row per state, one column per selected field.
    @test x isa DataFrame
    @test size(x) == (2,4)

    # Mean age per group.
    @test x[1,:mage] == 20
    @test x[2,:mage] == 23

    # Maximum age per group.
    @test x[1,:oldest] == 30
    @test x[2,:oldest] == 33

    # Minimum age per group.
    @test x[1,:youngest] == 10
    @test x[2,:youngest] == 13
end

0 comments on commit d93b21f

Please sign in to comment.