Skip to content

Commit 2b3613b

Browse files
authored
Improve synthetic source for tdigest field (#138121)
Follow up to #137982 to support returning min, max, and sum in the synthetic source results. This also drops support for sending the count as a parameter (and thus doesn't include the total count in the synthetic source result), which matches the behavior of the exponential histogram field.
1 parent 5320838 commit 2b3613b

File tree

4 files changed

+67
-20
lines changed

4 files changed

+67
-20
lines changed

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/TDigestFieldMapper.java

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.apache.lucene.index.DocValues;
1414
import org.apache.lucene.index.LeafReader;
1515
import org.apache.lucene.index.LeafReaderContext;
16+
import org.apache.lucene.index.NumericDocValues;
1617
import org.apache.lucene.search.Query;
1718
import org.apache.lucene.search.SortField;
1819
import org.apache.lucene.util.BytesRef;
@@ -69,7 +70,6 @@ public class TDigestFieldMapper extends FieldMapper {
6970
public static final String CENTROIDS_NAME = "centroids";
7071
public static final String COUNTS_NAME = "counts";
7172
public static final String SUM_FIELD_NAME = "sum";
72-
public static final String TOTAL_COUNT_FIELD_NAME = "count";
7373
public static final String MIN_FIELD_NAME = "min";
7474
public static final String MAX_FIELD_NAME = "max";
7575
public static final String CONTENT_TYPE = "tdigest";
@@ -506,17 +506,41 @@ protected SyntheticSourceSupport syntheticSourceSupport() {
506506
private class TDigestSyntheticFieldLoader implements CompositeSyntheticFieldLoader.DocValuesLayer {
507507
private final InternalTDigestValue value = new InternalTDigestValue();
508508
private BytesRef binaryValue;
509+
private double min;
510+
private double max;
511+
private double sum;
509512

510513
@Override
511514
public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
512515
BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldType().name());
516+
NumericDocValues minValues = leafReader.getNumericDocValues(valuesMinSubFieldName(fullPath()));
517+
NumericDocValues maxValues = leafReader.getNumericDocValues(valuesMaxSubFieldName(fullPath()));
518+
NumericDocValues sumValues = leafReader.getNumericDocValues(valuesSumSubFieldName(fullPath()));
513519
if (docValues == null) {
514520
// No values in this leaf
515521
binaryValue = null;
516522
return null;
517523
}
518524
return docId -> {
519525
if (docValues.advanceExact(docId)) {
526+
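// we assume the summary sub-fields have a value for every document that has a digest (and that the sum sub-field always exists), so the advanceExact results are not checked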
527+
if (minValues != null) {
528+
minValues.advanceExact(docId);
529+
min = NumericUtils.sortableLongToDouble(minValues.longValue());
530+
} else {
531+
min = Double.NaN;
532+
}
533+
534+
if (maxValues != null) {
535+
maxValues.advanceExact(docId);
536+
max = NumericUtils.sortableLongToDouble(maxValues.longValue());
537+
} else {
538+
max = Double.NaN;
539+
}
540+
541+
sumValues.advanceExact(docId);
542+
sum = NumericUtils.sortableLongToDouble(sumValues.longValue());
543+
520544
binaryValue = docValues.binaryValue();
521545
return true;
522546
}
@@ -536,9 +560,17 @@ public void write(XContentBuilder b) throws IOException {
536560
return;
537561
}
538562
value.reset(binaryValue);
539-
540563
b.startObject();
564+
541565
// Write the summary values loaded from the sub-fields; min and max are omitted when they were not available (NaN)
566+
if (Double.isNaN(min) == false) {
567+
b.field("min", min);
568+
}
569+
if (Double.isNaN(max) == false) {
570+
b.field("max", max);
571+
}
572+
b.field("sum", sum);
573+
542574
b.startArray(CENTROIDS_NAME);
543575
while (value.next()) {
544576
b.value(value.value());

x-pack/plugin/analytics/src/main/java/org/elasticsearch/xpack/analytics/mapper/TDigestParser.java

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,11 @@
2121
import static org.elasticsearch.xpack.analytics.mapper.TDigestFieldMapper.MAX_FIELD_NAME;
2222
import static org.elasticsearch.xpack.analytics.mapper.TDigestFieldMapper.MIN_FIELD_NAME;
2323
import static org.elasticsearch.xpack.analytics.mapper.TDigestFieldMapper.SUM_FIELD_NAME;
24-
import static org.elasticsearch.xpack.analytics.mapper.TDigestFieldMapper.TOTAL_COUNT_FIELD_NAME;
2524

2625
public class TDigestParser {
2726

2827
private static final ParseField COUNTS_FIELD = new ParseField(COUNTS_NAME);
2928
private static final ParseField CENTROIDS_FIELD = new ParseField(CENTROIDS_NAME);
30-
private static final ParseField TOTAL_COUNT_FIELD = new ParseField(TOTAL_COUNT_FIELD_NAME);
3129
private static final ParseField SUM_FIELD = new ParseField(SUM_FIELD_NAME);
3230
private static final ParseField MAX_FIELD = new ParseField(MAX_FIELD_NAME);
3331
private static final ParseField MIN_FIELD = new ParseField(MIN_FIELD_NAME);
@@ -37,7 +35,7 @@ public class TDigestParser {
3735
* @param centroids the centroids, guaranteed to be distinct and in increasing order
3836
* @param counts the counts, guaranteed to be non-negative and of the same length as the centroids array
3937
*/
40-
public record ParsedTDigest(List<Double> centroids, List<Long> counts, Long count, Double sum, Double min, Double max) {
38+
public record ParsedTDigest(List<Double> centroids, List<Long> counts, Double sum, Double min, Double max) {
4139
@Override
4240
public Double max() {
4341
if (max != null) {
@@ -75,11 +73,7 @@ public Double sum() {
7573
return Double.NaN;
7674
}
7775

78-
@Override
7976
public Long count() {
80-
if (count != null) {
81-
return count;
82-
}
8377
if (counts != null && counts.isEmpty() == false) {
8478
long observedCount = 0;
8579
for (Long count : counts) {
@@ -102,7 +96,6 @@ public Long count() {
10296
public static ParsedTDigest parse(String mappedFieldName, XContentParser parser) throws IOException {
10397
ArrayList<Double> centroids = null;
10498
ArrayList<Long> counts = null;
105-
Long count = null;
10699
Double sum = null;
107100
Double min = null;
108101
Double max = null;
@@ -127,10 +120,6 @@ public static ParsedTDigest parse(String mappedFieldName, XContentParser parser)
127120
token = parser.nextToken();
128121
ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, token, parser);
129122
max = parser.doubleValue();
130-
} else if (fieldName.equals(TOTAL_COUNT_FIELD.getPreferredName())) {
131-
token = parser.nextToken();
132-
ensureExpectedToken(XContentParser.Token.VALUE_NUMBER, token, parser);
133-
count = parser.longValue();
134123
} else {
135124
throw new DocumentParsingException(
136125
parser.getTokenLocation(),
@@ -169,12 +158,11 @@ public static ParsedTDigest parse(String mappedFieldName, XContentParser parser)
169158
);
170159
}
171160
if (centroids.isEmpty()) {
172-
count = 0L;
173161
sum = 0.0;
174162
min = null;
175163
max = null;
176164
}
177-
return new ParsedTDigest(centroids, counts, count, sum, min, max);
165+
return new ParsedTDigest(centroids, counts, sum, min, max);
178166
}
179167

180168
private static ArrayList<Long> getCounts(String mappedFieldName, XContentParser parser) throws IOException {

x-pack/plugin/analytics/src/test/java/org/elasticsearch/xpack/analytics/mapper/TDigestFieldMapperTests.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,9 @@ public void testArrayValueSyntheticSource() throws Exception {
434434
{
435435
expected.startArray("field");
436436
expected.startObject();
437+
expected.field("min", 1.0d);
438+
expected.field("max", 3.0d);
439+
expected.field("sum", 14.0d);
437440
expected.field("centroids", new double[] { 1, 2, 3 });
438441
expected.field("counts", new int[] { 1, 2, 3 });
439442
expected.endObject();
@@ -458,10 +461,17 @@ private static Map<String, Object> generateRandomFieldValues(int maxVals) {
458461
}
459462
List<Double> centroids = new ArrayList<>();
460463
List<Long> counts = new ArrayList<>();
464+
double sum = 0.0;
461465
for (Centroid c : digest.centroids()) {
462466
centroids.add(c.mean());
463467
counts.add(c.count());
468+
sum += c.mean() * c.count();
464469
}
470+
double min = digest.getMin();
471+
double max = digest.getMax();
472+
value.put("min", min);
473+
value.put("max", max);
474+
value.put("sum", sum);
465475
value.put("centroids", centroids);
466476
value.put("counts", counts);
467477

x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/analytics/t_digest_fieldtype.yml

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ setup:
1919
- '{"index": {"_id": 2}}'
2020
- '{"latency": {"centroids" : [0, 0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 2, 5, 10, 1, 8]}}'
2121
- '{"index": {"_id": 3}}'
22-
- '{"latency": {"sum": 8.6, "count": 29, "min": 0, "max": 0.5, "centroids" : [0, 0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 2, 5, 10, 1, 8]}}'
22+
- '{"latency": {"sum": 8.6, "min": 0, "max": 0.5, "centroids" : [0, 0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 2, 5, 10, 1, 8]}}'
2323
---
2424
"TDigest requires values in increasing order":
2525
- do:
@@ -47,11 +47,9 @@ TDigest get:
4747
- match:
4848
_source:
4949
latency:
50-
# Here the summary fields were in the source, so we get them back
5150
min: 0
5251
max: 0.5
5352
sum: 8.6
54-
count: 29
5553
centroids: [ 0, 0.1, 0.2, 0.3, 0.4, 0.5 ]
5654
counts: [ 3, 2, 5, 10, 1, 8 ]
5755

@@ -89,6 +87,9 @@ TDigest with synthetic source:
8987
- match:
9088
_source:
9189
latency:
90+
min: 0.1
91+
max: 0.5
92+
sum: 16.4
9293
centroids: [ 0.1, 0.2, 0.3, 0.4, 0.5 ]
9394
counts: [ 3, 7, 23, 12, 6 ]
9495

@@ -99,6 +100,10 @@ TDigest with synthetic source:
99100
- match:
100101
_source:
101102
latency:
103+
min: 0.0
104+
max: 0.5
105+
# Floating-point rounding: the sum computed from the centroids and counts is not exactly 8.6
106+
sum: 8.600000000000001
102107
centroids: [ 0.0, 0.1, 0.2, 0.3, 0.4, 0.5 ]
103108
counts: [ 3, 2, 5, 10, 1, 8 ]
104109
---
@@ -124,7 +129,7 @@ TDigest with synthetic source and explicit summary fields:
124129
refresh: true
125130
body:
126131
- '{"index": {"_id": 1}}'
127-
- '{"latency": {"sum": 8.6, "count": 29, "min": 0, "max": 0.5, "centroids" : [0, 0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 2, 5, 10, 1, 8]}}'
132+
- '{"latency": {"sum": 8.6, "min": 0, "max": 0.5, "centroids" : [0, 0.1, 0.2, 0.3, 0.4, 0.5], "counts" : [3, 2, 5, 10, 1, 8]}}'
128133

129134
- do:
130135
get:
@@ -133,6 +138,10 @@ TDigest with synthetic source and explicit summary fields:
133138
- match:
134139
_source:
135140
latency:
141+
# Note that unlike the stored source case, we get back a float here
142+
min: 0.0
143+
max: 0.5
144+
sum: 8.6
136145
centroids: [ 0.0, 0.1, 0.2, 0.3, 0.4, 0.5 ]
137146
counts: [ 3, 2, 5, 10, 1, 8 ]
138147

@@ -168,6 +177,10 @@ TDigest with synthetic source and zero counts:
168177
- match:
169178
_source:
170179
latency:
180+
# Note that we're storing 0.1 as the min, even though its count is 0.
181+
min: 0.1
182+
max: 0.5
183+
sum: 3.8000000000000007
171184
centroids: [ 0.2, 0.4 ]
172185
counts: [ 7, 6 ]
173186

@@ -223,6 +236,9 @@ histogram with synthetic source and ignore_malformed:
223236
- match:
224237
_source:
225238
latency: [ {
239+
min: 2.0,
240+
max: 2.0,
241+
sum: 4.0,
226242
"centroids": [ 2.0 ],
227243
"counts": [ 2 ]
228244
},
@@ -260,6 +276,7 @@ TDigest with synthetic source and empty digest:
260276
- match:
261277
_source:
262278
latency:
279+
sum: 0.0
263280
centroids: [ ]
264281
counts: [ ]
265282

0 commit comments

Comments
 (0)