Span in PPL statsByClause could be specified after fields (#2720) (#2810

) (cherry picked from commit c063d5e) Signed-off-by: Lantao Jin <ltjin@amazon.com> Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Peng Huo <penghuo@gmail.com>
opensearch-project · Jul 10, 2024 · 49fbb6c · 49fbb6c
1 parent 9c3fc29
commit 49fbb6c
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 5 deletions.
diff --git a/docs/user/ppl/cmd/stats.rst b/docs/user/ppl/cmd/stats.rst
@@ -43,7 +43,7 @@ stats <aggregation>... [by-clause]
  * Description: The by clause could be the fields and expressions like scalar functions and aggregation functions. Besides, the span clause can be used to split specific field into buckets in the same interval, the stats then does the aggregation by these span buckets.
  * Default: If no <by-clause> is specified, the stats command returns only one row, which is the aggregation over the entire result set.
 
-* span-expression: optional.
+* span-expression: optional, at most one.
 
  * Syntax: span(field_expr, interval_expr)
  * Description: The unit of the interval expression is the natural unit by default. If the field is a date and time type field, and the interval is in date/time units, you will need to specify the unit in the interval expression. For example, to split the field ``age`` into buckets by 10 years, it looks like ``span(age, 10)``. And here is another example of time span, the span to split a ``timestamp`` field into hourly intervals, it looks like ``span(timestamp, 1h)``.
@@ -424,6 +424,20 @@ PPL query::
     | 1     | 35         | M        |
     +-------+------------+----------+
 
+Span will always be the first grouping key whatever order you specify.
+
+PPL query::
+
+    os> source=accounts | stats count() as cnt by gender, span(age, 5) as age_span
+    fetched rows / total rows = 3/3
+    +-------+------------+----------+
+    | cnt   | age_span   | gender   |
+    |-------+------------+----------|
+    | 1     | 25         | F        |
+    | 2     | 30         | M        |
+    | 1     | 35         | M        |
+    +-------+------------+----------+
+
 Example 10: Calculate the count and get email list by a gender and span
 =======================================================================
 

diff --git a/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java b/integ-test/src/test/java/org/opensearch/sql/ppl/StatsCommandIT.java
@@ -11,7 +11,9 @@
 import static org.opensearch.sql.util.MatcherUtils.rows;
 import static org.opensearch.sql.util.MatcherUtils.schema;
 import static org.opensearch.sql.util.MatcherUtils.verifyDataRows;
+import static org.opensearch.sql.util.MatcherUtils.verifyDataRowsInOrder;
 import static org.opensearch.sql.util.MatcherUtils.verifySchema;
+import static org.opensearch.sql.util.MatcherUtils.verifySchemaInOrder;
 
 import java.io.IOException;
 import org.json.JSONObject;
@@ -190,6 +192,54 @@ public void testStatsAliasedSpan() throws IOException {
     verifyDataRows(response, rows(1, 20), rows(6, 30));
   }
 
+  @Test
+  public void testStatsBySpanAndMultipleFields() throws IOException {
+    JSONObject response =
+        executeQuery(
+            String.format(
+                "source=%s | stats count() by span(age,10), gender, state", TEST_INDEX_BANK));
+    verifySchemaInOrder(
+        response,
+        schema("count()", null, "integer"),
+        schema("span(age,10)", null, "integer"),
+        schema("gender", null, "string"),
+        schema("state", null, "string"));
+    verifyDataRowsInOrder(
+        response,
+        rows(1, 20, "f", "VA"),
+        rows(1, 30, "f", "IN"),
+        rows(1, 30, "f", "PA"),
+        rows(1, 30, "m", "IL"),
+        rows(1, 30, "m", "MD"),
+        rows(1, 30, "m", "TN"),
+        rows(1, 30, "m", "WA"));
+  }
+
+  @Test
+  public void testStatsByMultipleFieldsAndSpan() throws IOException {
+    // Use verifySchemaInOrder() and verifyDataRowsInOrder() to check that the span column is always
+    // the first column in result whatever the order of span in query is first or last one
+    JSONObject response =
+        executeQuery(
+            String.format(
+                "source=%s | stats count() by gender, state, span(age,10)", TEST_INDEX_BANK));
+    verifySchemaInOrder(
+        response,
+        schema("count()", null, "integer"),
+        schema("span(age,10)", null, "integer"),
+        schema("gender", null, "string"),
+        schema("state", null, "string"));
+    verifyDataRowsInOrder(
+        response,
+        rows(1, 20, "f", "VA"),
+        rows(1, 30, "f", "IN"),
+        rows(1, 30, "f", "PA"),
+        rows(1, 30, "m", "IL"),
+        rows(1, 30, "m", "MD"),
+        rows(1, 30, "m", "TN"),
+        rows(1, 30, "m", "WA"));
+  }
+
   @Test
   public void testStatsPercentile() throws IOException {
     JSONObject response =

diff --git a/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java b/integ-test/src/test/java/org/opensearch/sql/util/MatcherUtils.java
@@ -144,6 +144,16 @@ public static void verifySchema(JSONObject response, Matcher<JSONObject>... matc
     }
   }
 
+  @SafeVarargs
+  public static void verifySchemaInOrder(JSONObject response, Matcher<JSONObject>... matchers) {
+    try {
+      verifyInOrder(response.getJSONArray("schema"), matchers);
+    } catch (Exception e) {
+      LOG.error(String.format("verify schema failed, response: %s", response.toString()), e);
+      throw e;
+    }
+  }
+
   @SafeVarargs
   public static void verifyDataRows(JSONObject response, Matcher<JSONArray>... matchers) {
     verify(response.getJSONArray("datarows"), matchers);

diff --git a/ppl/src/main/antlr/OpenSearchPPLParser.g4 b/ppl/src/main/antlr/OpenSearchPPLParser.g4
@@ -188,6 +188,7 @@ statsByClause
    : BY fieldList
    | BY bySpanClause
    | BY bySpanClause COMMA fieldList
+   | BY fieldList COMMA bySpanClause
    ;
 
 bySpanClause

diff --git a/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java b/ppl/src/test/java/org/opensearch/sql/ppl/parser/AstBuilderTest.java
@@ -319,11 +319,26 @@ public void testStatsCommandWithSpan() {
             exprList(alias("f1", field("f1")), alias("f2", field("f2"))),
             alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
             defaultStatsArgs()));
-  }
 
-  @Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class)
-  public void throwExceptionIfSpanInGroupByList() {
-    plan("source=t | stats avg(price) by f1, f2, span(timestamp, 1h)");
+    assertEqual(
+        "source=t | stats avg(price) by b, span(timestamp, 1h)",
+        agg(
+            relation("t"),
+            exprList(alias("avg(price)", aggregate("avg", field("price")))),
+            emptyList(),
+            exprList(alias("b", field("b"))),
+            alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
+            defaultStatsArgs()));
+
+    assertEqual(
+        "source=t | stats avg(price) by f1, f2, span(timestamp, 1h)",
+        agg(
+            relation("t"),
+            exprList(alias("avg(price)", aggregate("avg", field("price")))),
+            emptyList(),
+            exprList(alias("f1", field("f1")), alias("f2", field("f2"))),
+            alias("span(timestamp,1h)", span(field("timestamp"), intLiteral(1), SpanUnit.H)),
+            defaultStatsArgs()));
   }
 
   @Test(expected = org.opensearch.sql.common.antlr.SyntaxCheckException.class)