@@ -116,7 +116,7 @@ public static void writeFieldStats(StreamOutput out, ObjectObjectHashMap<String,
116
116
for (ObjectObjectCursor <String , CollectionStatistics > c : fieldStatistics ) {
117
117
out .writeString (c .key );
118
118
CollectionStatistics statistics = c .value ;
119
- assert statistics .maxDoc () >= 0 ;
119
+ assert statistics .maxDoc () > 0 ;
120
120
out .writeVLong (statistics .maxDoc ());
121
121
if (out .getVersion ().onOrAfter (Version .V_7_0_0 )) {
122
122
// stats are always positive numbers
@@ -156,8 +156,8 @@ static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamIn
156
156
final String field = in .readString ();
157
157
assert field != null ;
158
158
final long maxDoc = in .readVLong ();
159
- final long docCount ;
160
- final long sumTotalTermFreq ;
159
+ long docCount ;
160
+ long sumTotalTermFreq ;
161
161
final long sumDocFreq ;
162
162
if (in .getVersion ().onOrAfter (Version .V_7_0_0 )) {
163
163
// stats are always positive numbers
@@ -168,6 +168,26 @@ static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamIn
168
168
docCount = subOne (in .readVLong ());
169
169
sumTotalTermFreq = subOne (in .readVLong ());
170
170
sumDocFreq = subOne (in .readVLong ());
171
+ if (sumTotalTermFreq == -1L ) {
172
+ // Lucene 7 and earlier used -1 to denote that this information wasn't stored by the codec
173
+ // or that this field omitted term frequencies and positions. It used sumDocFreq as fallback in that case
174
+ // when calculating similarities. See LUCENE-8007 for more information.
175
+ sumTotalTermFreq = sumDocFreq ;
176
+ }
177
+ if (docCount == -1L ) {
178
+ // Lucene 7 and earlier used -1 to denote that this information wasn't stored by the codec
179
+ // It used maxDoc as fallback in that case when calculating similarities. See LUCENE-8007 for more information.
180
+ docCount = maxDoc ;
181
+ }
182
+ if (docCount == 0L ) {
183
+ // empty stats object (LUCENE-8020)
184
+ assert maxDoc == 0 && docCount == 0 && sumTotalTermFreq == 0 && sumDocFreq == 0 :
185
+ " maxDoc:" + maxDoc +
186
+ " docCount:" + docCount +
187
+ " sumTotalTermFreq:" + sumTotalTermFreq +
188
+ " sumDocFreq:" + sumDocFreq ;
189
+ continue ;
190
+ }
171
191
}
172
192
CollectionStatistics stats = new CollectionStatistics (field , maxDoc , docCount , sumTotalTermFreq , sumDocFreq );
173
193
fieldStatistics .put (field , stats );
@@ -187,10 +207,18 @@ static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOExc
187
207
BytesRef term = terms [i ].bytes ();
188
208
final long docFreq = in .readVLong ();
189
209
assert docFreq >= 0 ;
190
- final long totalTermFreq = subOne (in .readVLong ());
210
+ long totalTermFreq = subOne (in .readVLong ());
191
211
if (docFreq == 0 ) {
192
212
continue ;
193
213
}
214
+ if (in .getVersion ().before (Version .V_7_0_0 )) {
215
+ if (totalTermFreq == -1L ) {
216
+ // Lucene 7 and earlier used -1 to denote that this information isn't stored by the codec
217
+ // or that this field omits term frequencies and positions. It used docFreq as fallback in that case
218
+ // when calculating similarities. See LUCENE-8007 for more information.
219
+ totalTermFreq = docFreq ;
220
+ }
221
+ }
194
222
termStatistics [i ] = new TermStatistics (term , docFreq , totalTermFreq );
195
223
}
196
224
}
0 commit comments