Skip to content

Commit 8082bbb

Browse files
committed
Fix term stats when talking to ES 6 (#75735)
In a mixed 6.x and 7.x cluster, a search that uses dfs_query_then_fetch can cause transport serialization errors. This is related to https://issues.apache.org/jira/browse/LUCENE-8007, which was introduced in Lucene 8 and adds stricter checks to TermStatistics and CollectionStatistics, and https://issues.apache.org/jira/browse/LUCENE-8020, which was introduced in Lucene 8 and avoids bogus term stats (e.g. docFreq=0). Co-authored-by: Julie Tibshirani [email protected] Closes #75349
1 parent 21b5a2d commit 8082bbb

File tree

6 files changed

+176
-7
lines changed

6 files changed

+176
-7
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
---
# Mixed-cluster/BWC REST test: runs a dfs_query_then_fetch search against
# "keyword_index" and checks the hit count.
# NOTE(review): this variant only searches — it assumes keyword_index and its
# three documents were created by a companion test in the suite; confirm the
# suite ordering before relying on it in isolation.
"Perform a dfs_query_then_fetch search on a keyword field":
  - do:
      search:
        # dfs_query_then_fetch forces the DFS phase, which serializes
        # TermStatistics/CollectionStatistics over the transport layer —
        # the code path fixed by this commit.
        search_type: dfs_query_then_fetch
        index: keyword_index
        # report hits.total as a plain integer (pre-7.x response shape)
        rest_total_hits_as_int: true
        body:
          query:
            match:
              field:
                query: value

  # all three indexed documents match "value"
  - match: { hits.total: 3 }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
---
# Self-contained REST test: creates a keyword-mapped index, indexes three
# identical documents, then runs a dfs_query_then_fetch search to exercise
# serialization of distributed term statistics.
"Perform a dfs_query_then_fetch search on a keyword field":
  - do:
      indices.create:
        index: keyword_index
        body:
          mappings:
            properties:
              field:
                # keyword fields omit term frequencies, producing the
                # sentinel term stats this commit handles
                type: keyword

  # index three documents; refresh after each so they are searchable
  - do:
      index:
        index: keyword_index
        body:
          field: value
        refresh: true

  - do:
      index:
        index: keyword_index
        body:
          field: value
        refresh: true

  - do:
      index:
        index: keyword_index
        body:
          field: value
        refresh: true

  - do:
      search:
        # dfs_query_then_fetch forces the DFS phase that ships term stats
        # between nodes over the transport layer
        search_type: dfs_query_then_fetch
        index: keyword_index
        # report hits.total as a plain integer (pre-7.x response shape)
        rest_total_hits_as_int: true
        body:
          query:
            match:
              field:
                query: value

  # all three indexed documents match "value"
  - match: { hits.total: 3 }
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
---
# BWC REST test (second copy of the search-only variant, presumably for a
# different upgrade stage directory): runs a dfs_query_then_fetch search
# against "keyword_index" and checks the hit count.
# NOTE(review): assumes keyword_index and its three documents already exist
# from an earlier stage of the rolling-upgrade suite — confirm suite ordering.
"Perform a dfs_query_then_fetch search on a keyword field":
  - do:
      search:
        # dfs_query_then_fetch forces the DFS phase, which serializes term
        # statistics over the transport layer — the code path under test
        search_type: dfs_query_then_fetch
        index: keyword_index
        # report hits.total as a plain integer (pre-7.x response shape)
        rest_total_hits_as_int: true
        body:
          query:
            match:
              field:
                query: value

  # all three indexed documents match "value"
  - match: { hits.total: 3 }

server/src/main/java/org/elasticsearch/search/dfs/AggregatedDfs.java

+15-3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
import org.apache.lucene.index.Term;
1515
import org.apache.lucene.search.CollectionStatistics;
1616
import org.apache.lucene.search.TermStatistics;
17+
import org.apache.lucene.util.BytesRef;
18+
import org.elasticsearch.Version;
1719
import org.elasticsearch.common.collect.HppcMaps;
1820
import org.elasticsearch.common.io.stream.StreamInput;
1921
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -32,9 +34,19 @@ public AggregatedDfs(StreamInput in) throws IOException {
3234
termStatistics = HppcMaps.newMap(size);
3335
for (int i = 0; i < size; i++) {
3436
Term term = new Term(in.readString(), in.readBytesRef());
35-
TermStatistics stats = new TermStatistics(in.readBytesRef(),
36-
in.readVLong(),
37-
DfsSearchResult.subOne(in.readVLong()));
37+
BytesRef term2 = in.readBytesRef();
38+
final long docFreq = in.readVLong();
39+
assert docFreq >= 0;
40+
long totalTermFreq = DfsSearchResult.subOne(in.readVLong());
41+
if (in.getVersion().before(Version.V_7_0_0)) {
42+
if (totalTermFreq == -1L) {
43+
// Lucene 7 and earlier used -1 to denote that this information wasn't stored by the codec
44+
// or that this field omitted term frequencies and positions. It used docFreq as fallback in that case
45+
// when calculating similarities. See LUCENE-8007 for more information.
46+
totalTermFreq = docFreq;
47+
}
48+
}
49+
TermStatistics stats = new TermStatistics(term2, docFreq, totalTermFreq);
3850
termStatistics.put(term, stats);
3951
}
4052
fieldStatistics = DfsSearchResult.readFieldStats(in);

server/src/main/java/org/elasticsearch/search/dfs/DfsSearchResult.java

+32-4
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String,
116116
for (ObjectObjectCursor<String, CollectionStatistics> c : fieldStatistics) {
117117
out.writeString(c.key);
118118
CollectionStatistics statistics = c.value;
119-
assert statistics.maxDoc() >= 0;
119+
assert statistics.maxDoc() > 0;
120120
out.writeVLong(statistics.maxDoc());
121121
if (out.getVersion().onOrAfter(Version.V_7_0_0)) {
122122
// stats are always positive numbers
@@ -156,8 +156,8 @@ static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamIn
156156
final String field = in.readString();
157157
assert field != null;
158158
final long maxDoc = in.readVLong();
159-
final long docCount;
160-
final long sumTotalTermFreq;
159+
long docCount;
160+
long sumTotalTermFreq;
161161
final long sumDocFreq;
162162
if (in.getVersion().onOrAfter(Version.V_7_0_0)) {
163163
// stats are always positive numbers
@@ -168,6 +168,26 @@ static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamIn
168168
docCount = subOne(in.readVLong());
169169
sumTotalTermFreq = subOne(in.readVLong());
170170
sumDocFreq = subOne(in.readVLong());
171+
if (sumTotalTermFreq == -1L) {
172+
// Lucene 7 and earlier used -1 to denote that this information wasn't stored by the codec
173+
// or that this field omitted term frequencies and positions. It used docFreq as fallback in that case
174+
// when calculating similarities. See LUCENE-8007 for more information.
175+
sumTotalTermFreq = sumDocFreq;
176+
}
177+
if (docCount == -1L) {
178+
// Lucene 7 and earlier used -1 to denote that this information wasn't stored by the codec
179+
// It used maxDoc as fallback in that case when calculating similarities. See LUCENE-8007 for more information.
180+
docCount = maxDoc;
181+
}
182+
if (docCount == 0L) {
183+
// empty stats object (LUCENE-8020)
184+
assert maxDoc == 0 && docCount == 0 && sumTotalTermFreq == 0 && sumDocFreq == 0:
185+
" maxDoc:" + maxDoc +
186+
" docCount:" + docCount +
187+
" sumTotalTermFreq:" + sumTotalTermFreq +
188+
" sumDocFreq:" + sumDocFreq;
189+
continue;
190+
}
171191
}
172192
CollectionStatistics stats = new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
173193
fieldStatistics.put(field, stats);
@@ -187,10 +207,18 @@ static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOExc
187207
BytesRef term = terms[i].bytes();
188208
final long docFreq = in.readVLong();
189209
assert docFreq >= 0;
190-
final long totalTermFreq = subOne(in.readVLong());
210+
long totalTermFreq = subOne(in.readVLong());
191211
if (docFreq == 0) {
192212
continue;
193213
}
214+
if (in.getVersion().before(Version.V_7_0_0)) {
215+
if (totalTermFreq == -1L) {
216+
// Lucene 7 and earlier used -1 to denote that this information isn't stored by the codec
217+
// or that this field omits term frequencies and positions. It used docFreq as fallback in that case
218+
// when calculating similarities. See LUCENE-8007 for more information.
219+
totalTermFreq = docFreq;
220+
}
221+
}
194222
termStatistics[i] = new TermStatistics(term, docFreq, totalTermFreq);
195223
}
196224
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.search.dfs;

import com.carrotsearch.hppc.ObjectObjectHashMap;

import org.apache.lucene.search.CollectionStatistics;
import org.elasticsearch.Version;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.VersionUtils;

import java.io.IOException;

public class DfsSearchResultTests extends ESTestCase {

    /**
     * Checks inputs from 6.x that are difficult to simulate in a BWC mixed-cluster test, in particular the case
     * where docCount == -1L which does not occur with the codecs that we typically use.
     */
    public void test6xSerialization() throws IOException {
        // Hand-write a single field-stats entry using the 6.8.x wire format
        // (readFieldStats takes the pre-7.0 branch when the stream version is < 7.0.0).
        Version version = VersionUtils.randomVersionBetween(random(), Version.V_6_8_0, Version.V_6_8_18);
        BytesStreamOutput os = new BytesStreamOutput();
        os.setVersion(version);
        os.writeVInt(1); // number of field-stats entries
        String field = randomAlphaOfLength(10);
        os.writeString(field);
        long maxDoc = randomIntBetween(1, 5);
        os.writeVLong(maxDoc);
        // -1 is the Lucene 7 "not stored by the codec" sentinel; addOne mirrors the
        // subOne applied by readFieldStats so the value round-trips through a VLong.
        long docCount = randomBoolean() ? -1 : randomIntBetween(1, (int) maxDoc);
        os.writeVLong(DfsSearchResult.addOne(docCount));
        long sumTotalTermFreq = randomBoolean() ? -1 : randomIntBetween(20, 30);
        os.writeVLong(DfsSearchResult.addOne(sumTotalTermFreq));
        // keep sumDocFreq <= sumTotalTermFreq whenever the latter is present,
        // matching what a real index would produce
        long sumDocFreq = sumTotalTermFreq == -1 ? randomIntBetween(20, 30) : randomIntBetween(20, (int) sumTotalTermFreq);
        os.writeVLong(DfsSearchResult.addOne(sumDocFreq));

        try (StreamInput input = StreamInput.wrap(BytesReference.toBytes(os.bytes()))) {
            input.setVersion(version);
            ObjectObjectHashMap<String, CollectionStatistics> stats = DfsSearchResult.readFieldStats(input);
            assertEquals(stats.size(), 1);
            assertNotNull(stats.get(field));
            CollectionStatistics cs = stats.get(field);
            assertEquals(field, cs.field());
            assertEquals(maxDoc, cs.maxDoc());
            // the -1 sentinels must have been replaced by the Lucene 7 fallbacks:
            // docCount falls back to maxDoc, sumTotalTermFreq to sumDocFreq
            assertEquals(docCount == -1 ? maxDoc : docCount, cs.docCount());
            assertEquals(sumDocFreq, cs.sumDocFreq());
            assertEquals(sumTotalTermFreq == -1 ? sumDocFreq : sumTotalTermFreq, cs.sumTotalTermFreq());
        }
    }
}

0 commit comments

Comments
 (0)