Skip to content

Commit

Permalink
Add missing_bucket option in the composite agg
Browse files Browse the repository at this point in the history
This change adds a new option to the composite aggregation named `missing_bucket`.
This option can be set by source and dictates whether documents without a value for the
source should be ignored. When set to `true`, documents without a value for a field emit
an explicit `null` value which is then added to the composite bucket.
The `missing` option that allows to set an explicit value (instead of `null`) is deprecated in this change
and will be removed in a follow up (only in 7.x).
This commit also changes how the big arrays are allocated: instead of reserving
the provided `size` for all sources, they are created with a small initial size and grow
depending on the number of buckets created by the aggregation.
Closes elastic#29380
  • Loading branch information
jimczi committed Apr 11, 2018
1 parent 4918924 commit 962bdb4
Show file tree
Hide file tree
Showing 25 changed files with 913 additions and 223 deletions.
30 changes: 30 additions & 0 deletions docs/reference/aggregations/bucket/composite-aggregation.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ POST /sales/docs/_bulk?refresh
{"product": "mad max", "price": "27", "timestamp": "2017-05-10T07:07"}
{"index":{"_id":4}}
{"product": "apocalypse now", "price": "10", "timestamp": "2017-05-11T08:35"}
{"index":{"_id":5}}
{"product": "apocalypse now", "price": "10", "timestamp": "2017-05-11T08:35"}
-------------------------------------------------
// NOTCONSOLE
// TESTSETUP
Expand Down Expand Up @@ -348,6 +350,34 @@ GET /_search
... will sort the composite bucket in descending order when comparing values from the `date_histogram` source
and in ascending order when comparing values from the `terms` source.

====== Missing bucket

By default documents without a value for a given source are ignored.
It is possible to include them in the response by setting `missing_bucket` to
`true` (defaults to `false`):

[source,js]
--------------------------------------------------
GET /_search
{
"aggs" : {
"my_buckets": {
"composite" : {
"sources" : [
{ "product_name": { "terms" : { "field": "product", "missing_bucket": true } } }
]
}
}
}
}
--------------------------------------------------
// CONSOLE

In the example above the source `product_name` will emit an explicit `null` value
for documents without a value for the field `product`.
The `order` specified in the source dictates whether the `null` values should rank
first (ascending order, `asc`) or last (descending order, `desc`).

==== Size

The `size` parameter can be set to define how many composite buckets should be returned.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,3 +323,32 @@ setup:
- length: { aggregations.test.buckets: 2 }
- length: { aggregations.test.after_key: 1 }
- match: { aggregations.test.after_key.keyword: "foo" }

---
"Composite aggregation and array size":
- skip:
version: " - 6.99.99"
reason: starting in 7.0 the composite sources do not allocate arrays eagerly.

- do:
search:
index: test
body:
aggregations:
test:
composite:
size: 1000000000
sources: [
{
"keyword": {
"terms": {
"field": "keyword",
}
}
}
]

- match: {hits.total: 6}
- length: { aggregations.test.buckets: 2 }
- length: { aggregations.test.after_key: 1 }
- match: { aggregations.test.after_key.keyword: "foo" }
Original file line number Diff line number Diff line change
Expand Up @@ -24,49 +24,91 @@
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.common.CheckedFunction;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.mapper.KeywordFieldMapper;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.StringFieldType;
import org.elasticsearch.index.mapper.TextFieldMapper;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.LeafBucketCollector;

import java.io.IOException;
import java.util.function.LongConsumer;

/**
* A {@link SingleDimensionValuesSource} for binary source ({@link BytesRef}).
*/
class BinaryValuesSource extends SingleDimensionValuesSource<BytesRef> {
private final CheckedFunction<LeafReaderContext, SortedBinaryDocValues, IOException> docValuesFunc;
private final BytesRef[] values;
private ObjectArray<BytesRef> values;
private ObjectArray<BytesRefBuilder> valueBuilders;
private BytesRef currentValue;

BinaryValuesSource(MappedFieldType fieldType, CheckedFunction<LeafReaderContext, SortedBinaryDocValues, IOException> docValuesFunc,
DocValueFormat format, Object missing, int size, int reverseMul) {
super(format, fieldType, missing, size, reverseMul);
/**
 * Creates a values source over binary doc values ({@link BytesRef}).
 *
 * @param bigArrays       allocator for the growable slot arrays
 * @param breakerConsumer callback notified with the number of bytes newly retained,
 *                        so the circuit breaker can account for them
 * @param fieldType       the mapped field this source reads, or null
 * @param docValuesFunc   provides the per-segment {@link SortedBinaryDocValues}
 * @param format          formatter used to render/parse values
 * @param missingBucket   whether documents without a value emit an explicit null bucket
 * @param missing         deprecated explicit replacement value for missing documents
 * @param size            the requested number of composite buckets
 * @param reverseMul      1 for ascending order, -1 for descending
 */
BinaryValuesSource(BigArrays bigArrays, LongConsumer breakerConsumer,
                   MappedFieldType fieldType, CheckedFunction<LeafReaderContext, SortedBinaryDocValues, IOException> docValuesFunc,
                   DocValueFormat format, boolean missingBucket, Object missing, int size, int reverseMul) {
    super(bigArrays, breakerConsumer, format, fieldType, missingBucket, missing, size, reverseMul);
    this.docValuesFunc = docValuesFunc;
    // Start small (at most 100 slots) and grow on demand instead of eagerly
    // reserving `size` entries, which could be very large.
    this.values = bigArrays.newObjectArray(Math.min(size, 100));
    this.valueBuilders = bigArrays.newObjectArray(Math.min(size, 100));
}

/**
 * Stores a copy of the current document's value into {@code slot}.
 * A null current value (only possible when {@code missingBucket} is enabled)
 * is stored as an explicit null; otherwise the bytes are copied into a
 * per-slot {@link BytesRefBuilder} that is reused across calls, and the
 * breaker is notified with the delta of retained bytes.
 */
@Override
public void copyCurrent(int slot) {
    // Make sure the backing arrays can hold this slot before writing to it.
    values = bigArrays.grow(values, slot + 1);
    valueBuilders = bigArrays.grow(valueBuilders, slot + 1);
    if (missingBucket && currentValue == null) {
        values.set(slot, null);
    } else {
        assert currentValue != null;
        // Create the builder lazily, only when this slot actually stores bytes.
        BytesRefBuilder builder = valueBuilders.get(slot);
        final int previousSize = builder == null ? 0 : builder.bytes().length;
        if (builder == null) {
            builder = new BytesRefBuilder();
            valueBuilders.set(slot, builder);
        }
        builder.copyBytes(currentValue);
        // Account only for the growth of this slot's backing buffer.
        breakerConsumer.accept(builder.bytes().length - previousSize);
        values.set(slot, builder.get());
    }
}

/**
 * Compares the values stored in two slots, honoring {@code reverseMul}.
 * When {@code missingBucket} is enabled, null sorts first in ascending
 * order and last in descending order.
 */
@Override
public int compare(int from, int to) {
    final BytesRef left = values.get(from);
    final BytesRef right = values.get(to);
    if (missingBucket) {
        if (left == null) {
            return right == null ? 0 : -reverseMul;
        }
        if (right == null) {
            return reverseMul;
        }
    }
    return compareValues(left, right);
}

/**
 * Compares the current document's value with the value stored in {@code slot}.
 * Null (missing) values rank first in ascending order, last in descending order.
 */
@Override
int compareCurrent(int slot) {
    final BytesRef slotValue = values.get(slot);
    if (missingBucket) {
        if (currentValue == null) {
            return slotValue == null ? 0 : -reverseMul;
        }
        if (slotValue == null) {
            return reverseMul;
        }
    }
    return compareValues(currentValue, slotValue);
}

/**
 * Compares the current document's value against the {@code after} key,
 * applying the same null ordering as the other comparisons: null ranks
 * first ascending, last descending.
 */
@Override
int compareCurrentWithAfter() {
    if (missingBucket) {
        if (currentValue == null) {
            return afterValue == null ? 0 : -reverseMul;
        }
        if (afterValue == null) {
            return reverseMul;
        }
    }
    return compareValues(currentValue, afterValue);
}

Expand All @@ -76,7 +118,9 @@ int compareValues(BytesRef v1, BytesRef v2) {

@Override
void setAfter(Comparable<?> value) {
if (value.getClass() == String.class) {
if (missingBucket && value == null) {
afterValue = null;
} else if (value.getClass() == String.class) {
afterValue = format.parseBytesRef(value.toString());
} else {
throw new IllegalArgumentException("invalid value, expected string, got " + value.getClass().getSimpleName());
Expand All @@ -85,7 +129,7 @@ void setAfter(Comparable<?> value) {

/** Returns the value stored in {@code slot}; may be null when {@code missingBucket} is enabled. */
@Override
BytesRef toComparable(int slot) {
    return values.get(slot);
}

@Override
Expand All @@ -100,6 +144,9 @@ public void collect(int doc, long bucket) throws IOException {
currentValue = dvs.nextValue();
next.collect(doc, bucket);
}
} else if (missingBucket) {
currentValue = null;
next.collect(doc, bucket);
}
}
};
Expand Down Expand Up @@ -130,5 +177,7 @@ SortedDocsProducer createSortedDocsProducerOrNull(IndexReader reader, Query quer
}

/** Releases the big arrays holding the collected values and their builders. */
@Override
public void close() {
    Releasables.close(values, valueBuilders);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.search.aggregations.bucket.composite;

import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.LongArray;

/**
 * A bit array backed by a growing {@link LongArray} allocated through
 * {@link BigArrays}. The underlying storage grows lazily based on the
 * highest index that is written; reads past the current storage report
 * an unset bit.
 */
final class BitArray implements Releasable {
    private final BigArrays bigArrays;
    private LongArray bits;

    BitArray(BigArrays bigArrays, int initialSize) {
        this.bigArrays = bigArrays;
        // `true` zero-fills the allocation so every bit starts unset.
        this.bits = bigArrays.newLongArray(initialSize, true);
    }

    /** Sets the bit at {@code index}, growing the storage if needed. */
    public void set(int index) {
        fill(index, true);
    }

    /** Clears the bit at {@code index}, growing the storage if needed. */
    public void clear(int index) {
        fill(index, false);
    }

    /**
     * Returns whether the bit at {@code index} is set. An index beyond the
     * current storage was never written, so it reports {@code false} rather
     * than reading out of bounds.
     */
    public boolean get(int index) {
        int wordNum = index >> 6;
        if (wordNum >= bits.size()) {
            return false; // storage never grew that far, so the bit is unset
        }
        // Java's long shift masks the count to the low 6 bits, i.e. index % 64.
        long bitmask = 1L << index;
        return (bits.get(wordNum) & bitmask) != 0;
    }

    /** Writes {@code bit} at {@code index}, growing the backing array on demand. */
    private void fill(int index, boolean bit) {
        int wordNum = index >> 6;
        bits = bigArrays.grow(bits, wordNum + 1);
        long bitmask = 1L << index;
        long value = bit ? bits.get(wordNum) | bitmask : bits.get(wordNum) & ~bitmask;
        bits.set(wordNum, value);
    }

    @Override
    public void close() {
        Releasables.close(bits);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

package org.elasticsearch.search.aggregations.bucket.composite;

import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.search.aggregations.bucket.MultiBucketsAggregation;

Expand Down Expand Up @@ -66,11 +65,7 @@ static XContentBuilder toXContentFragment(CompositeAggregation aggregation, XCon
/**
 * Serializes the composite key under {@code fieldName}: one field per
 * source, emitted in the map's iteration order. Values (including nulls
 * from {@code missing_bucket}) are written as-is; formatting to their
 * display representation happens before this point.
 */
static void buildCompositeMap(String fieldName, Map<String, Object> composite, XContentBuilder builder) throws IOException {
    builder.startObject(fieldName);
    for (Map.Entry<String, Object> entry : composite.entrySet()) {
        builder.field(entry.getKey(), entry.getValue());
    }
    builder.endObject();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,9 @@ protected AggregatorFactory<?> doBuild(SearchContext context, AggregatorFactory<
throw new IllegalArgumentException("Missing value for [after." + sources.get(i).name() + "]");
}
Object obj = after.get(sourceName);
if (obj instanceof Comparable) {
if (configs[i].missingBucket() && obj == null) {
values[i] = null;
} else if (obj instanceof Comparable) {
values[i] = (Comparable<?>) obj;
} else {
throw new IllegalArgumentException("Invalid value for [after." + sources.get(i).name() +
Expand Down
Loading

0 comments on commit 962bdb4

Please sign in to comment.