Skip to content

Commit 396b1dc

Browse files
not-napoleonelasticsearchmachine
andauthored
Add summary metrics for tdigest fields (#137982)
Add support for storing summary metrics (min, max, sum, and count) along side the t-digest histogram values. These values can be sent by the user, or computed at parse time. As of this PR, there is no validation of user sent values for these summary metrics, although we should do some validation in the future. --------- Co-authored-by: elasticsearchmachine <infra-root+elasticsearchmachine@elastic.co>
1 parent 70861ee commit 396b1dc

File tree

7 files changed

+637
-162
lines changed

7 files changed

+637
-162
lines changed

test/framework/src/main/java/org/elasticsearch/test/ESTestCase.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1136,6 +1136,13 @@ public static DoubleStream randomDoubles(long streamSize) {
11361136
return random().doubles(streamSize);
11371137
}
11381138

1139+
/**
1140+
* Returns a pseudo-random double from a Gaussian distribution with mean 0.0 and standard deviation 1.0
1141+
*/
1142+
public static double randomGaussianDouble() {
1143+
return random().nextGaussian();
1144+
}
1145+
11391146
/**
11401147
* Returns a double value in the interval [start, end) if lowerInclusive is
11411148
* set to true, (start, end) otherwise.

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/TDigestFieldMapper.java

Lines changed: 74 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@
88

99
import org.apache.lucene.document.BinaryDocValuesField;
1010
import org.apache.lucene.document.Field;
11+
import org.apache.lucene.document.NumericDocValuesField;
1112
import org.apache.lucene.index.BinaryDocValues;
1213
import org.apache.lucene.index.DocValues;
1314
import org.apache.lucene.index.LeafReader;
1415
import org.apache.lucene.index.LeafReaderContext;
1516
import org.apache.lucene.search.Query;
1617
import org.apache.lucene.search.SortField;
1718
import org.apache.lucene.util.BytesRef;
18-
import org.elasticsearch.TransportVersions;
19+
import org.apache.lucene.util.NumericUtils;
1920
import org.elasticsearch.common.Explicit;
2021
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
2122
import org.elasticsearch.common.io.stream.BytesStreamOutput;
@@ -67,6 +68,10 @@ public class TDigestFieldMapper extends FieldMapper {
6768

6869
public static final String CENTROIDS_NAME = "centroids";
6970
public static final String COUNTS_NAME = "counts";
71+
public static final String SUM_FIELD_NAME = "sum";
72+
public static final String TOTAL_COUNT_FIELD_NAME = "count";
73+
public static final String MIN_FIELD_NAME = "min";
74+
public static final String MAX_FIELD_NAME = "max";
7075
public static final String CONTENT_TYPE = "tdigest";
7176

7277
private static TDigestFieldMapper toType(FieldMapper in) {
@@ -202,7 +207,7 @@ public LeafHistogramFieldData load(LeafReaderContext context) {
202207
public HistogramValues getHistogramValues() throws IOException {
203208
try {
204209
final BinaryDocValues values = DocValues.getBinary(context.reader(), fieldName);
205-
final InternalHistogramValue value = new InternalHistogramValue();
210+
final InternalTDigestValue value = new InternalTDigestValue();
206211
return new HistogramValues() {
207212

208213
@Override
@@ -234,7 +239,7 @@ public DocValuesScriptFieldFactory getScriptFieldFactory(String name) {
234239
public FormattedDocValues getFormattedValues(DocValueFormat format) {
235240
try {
236241
final BinaryDocValues values = DocValues.getBinary(context.reader(), fieldName);
237-
final InternalHistogramValue value = new InternalHistogramValue();
242+
final InternalTDigestValue value = new InternalTDigestValue();
238243
return new FormattedDocValues() {
239244
@Override
240245
public boolean advanceExact(int docId) throws IOException {
@@ -337,20 +342,44 @@ public void parse(DocumentParserContext context) throws IOException {
337342
}
338343
subParser.nextToken();
339344
// TODO: Here we should build a t-digest out of the input, based on the settings on the field
340-
TDigestParser.ParsedHistogram parsedHistogram = TDigestParser.parse(fullPath(), subParser);
345+
TDigestParser.ParsedTDigest parsedTDigest = TDigestParser.parse(fullPath(), subParser);
341346

342347
BytesStreamOutput streamOutput = new BytesStreamOutput();
343-
for (int i = 0; i < parsedHistogram.centroids().size(); i++) {
344-
long count = parsedHistogram.counts().get(i);
348+
349+
for (int i = 0; i < parsedTDigest.centroids().size(); i++) {
350+
long count = parsedTDigest.counts().get(i);
345351
assert count >= 0;
346352
// we do not add elements with count == 0
347353
if (count > 0) {
348354
streamOutput.writeVLong(count);
349-
streamOutput.writeLong(Double.doubleToRawLongBits(parsedHistogram.centroids().get(i)));
355+
streamOutput.writeDouble(parsedTDigest.centroids().get(i));
350356
}
351357
}
358+
352359
BytesRef docValue = streamOutput.bytes().toBytesRef();
353-
Field field = new BinaryDocValuesField(fullPath(), docValue);
360+
Field digestField = new BinaryDocValuesField(fullPath(), docValue);
361+
362+
// Add numeric doc values fields for the summary data
363+
NumericDocValuesField maxField = null;
364+
if (Double.isNaN(parsedTDigest.max()) == false) {
365+
maxField = new NumericDocValuesField(
366+
valuesMaxSubFieldName(fullPath()),
367+
NumericUtils.doubleToSortableLong(parsedTDigest.max())
368+
);
369+
}
370+
371+
NumericDocValuesField minField = null;
372+
if (Double.isNaN(parsedTDigest.min()) == false) {
373+
minField = new NumericDocValuesField(
374+
valuesMinSubFieldName(fullPath()),
375+
NumericUtils.doubleToSortableLong(parsedTDigest.min())
376+
);
377+
}
378+
NumericDocValuesField countField = new NumericDocValuesField(valuesCountSubFieldName(fullPath()), parsedTDigest.count());
379+
NumericDocValuesField sumField = new NumericDocValuesField(
380+
valuesSumSubFieldName(fullPath()),
381+
NumericUtils.doubleToSortableLong(parsedTDigest.sum())
382+
);
354383
if (context.doc().getByKey(fieldType().name()) != null) {
355384
throw new IllegalArgumentException(
356385
"Field ["
@@ -360,7 +389,15 @@ public void parse(DocumentParserContext context) throws IOException {
360389
+ "] doesn't support indexing multiple values for the same field in the same document"
361390
);
362391
}
363-
context.doc().addWithKey(fieldType().name(), field);
392+
context.doc().addWithKey(fieldType().name(), digestField);
393+
context.doc().add(countField);
394+
context.doc().add(sumField);
395+
if (maxField != null) {
396+
context.doc().add(maxField);
397+
}
398+
if (minField != null) {
399+
context.doc().add(minField);
400+
}
364401

365402
} catch (Exception ex) {
366403
if (ignoreMalformed.value() == false) {
@@ -390,19 +427,36 @@ public void parse(DocumentParserContext context) throws IOException {
390427
context.path().remove();
391428
}
392429

430+
private static String valuesCountSubFieldName(String fullPath) {
431+
return fullPath + "._values_count";
432+
}
433+
434+
private static String valuesSumSubFieldName(String fullPath) {
435+
return fullPath + "._values_sum";
436+
}
437+
438+
private static String valuesMinSubFieldName(String fullPath) {
439+
return fullPath + "._values_min";
440+
}
441+
442+
private static String valuesMaxSubFieldName(String fullPath) {
443+
return fullPath + "._values_max";
444+
}
445+
393446
/** re-usable {@link HistogramValue} implementation */
394-
private static class InternalHistogramValue extends HistogramValue {
447+
private static class InternalTDigestValue extends HistogramValue {
395448
double value;
396449
long count;
397450
boolean isExhausted;
451+
398452
final ByteArrayStreamInput streamInput;
399453

400-
InternalHistogramValue() {
454+
InternalTDigestValue() {
401455
streamInput = new ByteArrayStreamInput();
402456
}
403457

404458
/** reset the value for the histogram */
405-
void reset(BytesRef bytesRef) {
459+
void reset(BytesRef bytesRef) throws IOException {
406460
streamInput.reset(bytesRef.bytes, bytesRef.offset, bytesRef.length);
407461
isExhausted = false;
408462
value = 0;
@@ -412,12 +466,8 @@ void reset(BytesRef bytesRef) {
412466
@Override
413467
public boolean next() throws IOException {
414468
if (streamInput.available() > 0) {
415-
if (streamInput.getTransportVersion().onOrAfter(TransportVersions.V_8_11_X)) {
416-
count = streamInput.readVLong();
417-
} else {
418-
count = streamInput.readVInt();
419-
}
420-
value = Double.longBitsToDouble(streamInput.readLong());
469+
count = streamInput.readVLong();
470+
value = streamInput.readDouble();
421471
return true;
422472
}
423473
isExhausted = true;
@@ -447,14 +497,14 @@ protected SyntheticSourceSupport syntheticSourceSupport() {
447497
() -> new CompositeSyntheticFieldLoader(
448498
leafName(),
449499
fullPath(),
450-
new HistogramSyntheticFieldLoader(),
500+
new TDigestSyntheticFieldLoader(),
451501
new CompositeSyntheticFieldLoader.MalformedValuesLayer(fullPath())
452502
)
453503
);
454504
}
455505

456-
private class HistogramSyntheticFieldLoader implements CompositeSyntheticFieldLoader.DocValuesLayer {
457-
private final InternalHistogramValue value = new InternalHistogramValue();
506+
private class TDigestSyntheticFieldLoader implements CompositeSyntheticFieldLoader.DocValuesLayer {
507+
private final InternalTDigestValue value = new InternalTDigestValue();
458508
private BytesRef binaryValue;
459509

460510
@Override
@@ -485,9 +535,10 @@ public void write(XContentBuilder b) throws IOException {
485535
if (binaryValue == null) {
486536
return;
487537
}
488-
b.startObject();
489-
490538
value.reset(binaryValue);
539+
540+
b.startObject();
541+
// TODO: Load the summary values out of the sub-fields, if they exist
491542
b.startArray(CENTROIDS_NAME);
492543
while (value.next()) {
493544
b.value(value.value());

0 commit comments

Comments
 (0)