Update some docs and benchmarks

queryverse · Aug 30, 2018 · e6ab23d · e6ab23d
1 parent f60b257
commit e6ab23d
Show file tree

Hide file tree

Showing 7 changed files with 40 additions and 80 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -4,4 +4,4 @@ site: https://discourse.julialang.org/c/domain/data. I use the GitHub
 issue tracker for bug reports and feature requests only.
 
 By contributing code to Query.jl, you are agreeing to release it under
-the [MIT License](https://github.com/davidanthoff/Query.jl/blob/master/LICENSE.md).
+the [MIT License](https://github.com/queryverse/Query.jl/blob/master/LICENSE.md).
diff --git a/benchmark/Rdatatable.jl b/benchmark/Rdatatable.jl
@@ -132,52 +132,52 @@ function benches(df::DataFrame)
 
     ti[:sum1] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum2] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum3] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum4] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum_mean1] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum_mean2] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     return ti
@@ -189,52 +189,52 @@ function benches(df::DataTable)
 
     ti[:sum1] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum2] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum3] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum4] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum_mean1] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum_mean2] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     return ti
@@ -246,52 +246,52 @@ function benches(df::IndexedTable)
 
     ti[:sum1] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum2] = @elapsed @from i in df begin
                              @group i by i.id1 into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum3] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum4] = @elapsed @from i in df begin
                              @group i by (i.id1,i.id2) into g
-                             @select {r=sum(g..v1)}
+                             @select {r=sum(g.v1)}
                              @collect DataFrame
                          end
     ti[:sum_mean1] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum_mean2] = @elapsed @from i in df begin
                              @group i by i.id3 into g
-                             @select {s=sum(g..v1),m=mean(g..v3)}
+                             @select {s=sum(g.v1),m=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_1] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:mean7_9_by_id4_2] = @elapsed @from i in df begin
                              @group i by i.id4 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_1] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     ti[:sum7_9_by_id6_2] = @elapsed @from i in df begin
                              @group i by i.id6 into g
-                             @select {m7=mean(g..v1),m8=mean(g..v2),m9=mean(g..v3)}
+                             @select {m7=mean(g.v1),m8=mean(g.v2),m9=mean(g.v3)}
                              @collect DataFrame
                          end
     return ti

diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
@@ -18,13 +18,13 @@ using DataTables
 
     @bench "two columns" @from i in $dt begin
         @group {i.A, i.B} by i.B into g
-        @select {m = mean(g..A)}
+        @select {m = mean(g.A)}
         @collect 
     end
 
     @bench "three columns" @from i in $dt begin
         @group {i.A, i.B, i.C} by i.B into g
-        @select {m = mean(g..A)}
+        @select {m = mean(g.A)}
         @collect 
     end
 end

diff --git a/docs/src/experimental.md b/docs/src/experimental.md
@@ -19,14 +19,14 @@ df = DataFrame(a=[1,1,2,3], b=[4,5,6,8])
 
 df2 = df |>
     @groupby(_.a) |>
-    @map({a=_.key, b=mean(_..b)}) |>
+    @map({a=key(_), b=mean(_.b)}) |>
     @filter(_.b > 5) |>
     @orderby_descending(_.b) |>
     DataFrame
 ```
 
 This example makes use of three experimental features: 1) the standalone
-query commands, 2) the `..` syntax and 3) the `_` anonymous function syntax.
+query commands, 2) the `.` syntax for extracting a column from a group and 3) the `_` anonymous function syntax.
 
 ## Standalone query operators
 
@@ -137,31 +137,6 @@ The `@take` command has the form `@take(source, n)`. `source` can be any source
 
 The `@drop` command has the form `@drop(source, n)`. `source` can be any source that can be queried. `n` must be an integer, and it specifies how many elements from the beginning of the source should be dropped from the results.
 
-## The `..` syntax
-
-The syntax `a..b` is translated into `map(i->i.b, a)` in any query
-expression. This is especially helpful when computing some reduction of
-a given column of a grouped table.
-
-For example, the following command groups a table by column `a`, and then
-computes the mean of the `b` column for each group:
-
-```julia
-using DataFrames, Query
-
-df = DataFrame(a=[1,1,2,3], b=[4,5,6,8])
-
-@from i in df begin
-    @group i by i.a into g
-    @select {a=i.key, b=mean(g..b)}
-    @collect DataFrame
-end
-```
-
-The `@group` command here creates a list of tables, i.e. `g` will hold
-a full table for each group. The syntax `g..b` then extracts a single
-column from that table.
-
 ## The `_` and `__` syntax
 
 This syntax only works in the standalone query commands. Instead of writing

diff --git a/docs/src/gettingstarted.md b/docs/src/gettingstarted.md
@@ -42,7 +42,7 @@ The Query package does not require data sources or sinks to have a table like st
 ## Missing values
 
 Missing values are represented as `DataValue` types from the
-[DataValues.jl](https://github.com/davidanthoff/DataValues.jl) package.
+[DataValues.jl](https://github.com/queryverse/DataValues.jl) package.
 Here are some usage tips.
 
 All arithmetic operators work automatically with missing values.

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -2,22 +2,7 @@
 
 ## Overview
 
-Query is a package for querying julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/davidanthoff/IterableTables.jl). One can for example query any of the following data sources:
-any array,
-[DataFrames](https://github.com/JuliaStats/DataFrames.jl),
-[DataStreams](https://github.com/JuliaData/DataStreams.jl)
-(including [CSV](https://github.com/JuliaData/CSV.jl),
-[Feather](https://github.com/JuliaStats/Feather.jl),
-[SQLite](https://github.com/JuliaDB/SQLite.jl),
-[ODBC](https://github.com/JuliaDB/ODBC.jl)),
-[DataTables](https://github.com/JuliaData/DataTables.jl),
-[IndexedTables](https://github.com/JuliaComputing/IndexedTables.jl),
-[TimeSeries](https://github.com/JuliaStats/TimeSeries.jl),
-[Temporal](https://github.com/dysonance/Temporal.jl),
-[TypedTables](https://github.com/FugroRoames/TypedTables.jl) and
-[DifferentialEquations](https://github.com/JuliaDiffEq/DifferentialEquations.jl) (any `DESolution`).
-
-The package currently provides working implementations for in-memory data sources, but will eventually be able to translate queries into e.g. SQL. There is a prototype implementation of such a "query provider" for [SQLite](https://github.com/JuliaDB/SQLite.jl) in the package, but it is experimental at this point and only works for a *very* small subset of queries.
+Query is a package for querying Julia data sources. It can filter, project, join and group data from any iterable data source, including all the sources supported in [IterableTables.jl](https://github.com/queryverse/IterableTables.jl).
 
 Query is heavily inspired by [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx); in fact, right now the package is largely an implementation of the [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) part of the [C# specification](https://msdn.microsoft.com/en-us/library/ms228593.aspx). Future versions of Query will most likely add features that are not found in the original [LINQ](https://msdn.microsoft.com/en-us/library/bb397926.aspx) design.
 

diff --git a/docs/src/querycommands.md b/docs/src/querycommands.md
@@ -268,7 +268,7 @@ df = DataFrame(name=["John", "Sally", "Kirk"], age=[23., 42., 59.], children=[3,
 
 x = @from i in df begin
     @group i by i.children into g
-    @select {Key=g.key,Count=length(g)}
+    @select {Key=key(g),Count=length(g)}
     @collect DataFrame
 end
 
@@ -285,7 +285,7 @@ println(x)
 
 ## Split-Apply-Combine (a.k.a. `dplyr`)
 
-`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous to *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g..var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise.
+`Query.jl` provides special syntax to summarise data in a `Query.Grouping` as above. *Summarising* here is synonymous with *aggregating* or *collapsing* the dataset over a certain grouping variable. Summarising thus requires an aggregating function like `mean`, `maximum`, or any other function that takes a vector and returns a scalar. The special syntax is `@select new_var = agg_fun(g.var)`, where `agg_fun` is your aggregation function (e.g. `mean`), `g` is your grouping, and `var` is the relevant column that you want to summarise.
 
 #### Example
 
@@ -298,7 +298,7 @@ df = DataFrame(name=repeat(["John", "Sally", "Kirk"],inner=[1],outer=[2]),
 
 x = @from i in df begin
     @group i by i.state into g
-    @select {group=g.key,mage=mean(g..age), oldest=maximum(g..age), youngest=minimum(g..age)}
+    @select {group=key(g),mage=mean(g.age), oldest=maximum(g.age), youngest=minimum(g.age)}
     @collect DataFrame
 end