Skip to content

Commit 9d6e19f

Browse files
ebartkus, wgtmac
authored and committed
GH-3172: Do not drop blocks with some null values if DictionaryFilter is applied for UserDefinedPredicate which keeps null values (#3173)
1 parent a19e985 commit 9d6e19f

File tree

2 files changed

+22
-0
lines changed

2 files changed

+22
-0
lines changed

parquet-hadoop/src/main/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilter.java

+4
Original file line number | Diff line number | Diff line change
@@ -529,6 +529,10 @@ private <T extends Comparable<T>, U extends UserDefinedPredicate<T>> Boolean vis
529529
return BLOCK_MIGHT_MATCH;
530530
}
531531

532+
if (udp.acceptsNullValue()) {
533+
return BLOCK_MIGHT_MATCH;
534+
}
535+
532536
try {
533537
Set<T> dictSet = expandDictionary(meta);
534538
if (dictSet == null) {

parquet-hadoop/src/test/java/org/apache/parquet/filter2/dictionarylevel/DictionaryFilterTest.java

+18
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,7 @@ public class DictionaryFilterTest {
106106
+ "required binary binary_field; "
107107
+ "required binary single_value_field; "
108108
+ "optional binary optional_single_value_field; "
109+
+ "optional int32 optional_single_value_int32_field;"
109110
+ "required fixed_len_byte_array(17) fixed_field (DECIMAL(40,4)); "
110111
+ "required int32 int32_field; "
111112
+ "required int64 int64_field; "
@@ -194,6 +195,7 @@ private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer)
194195
// 10% of the time, leave the field null
195196
if (index % 10 > 0) {
196197
group.append("optional_single_value_field", "sharp");
198+
group.append("optional_single_value_int32_field", 42);
197199
}
198200

199201
writer.write(group);
@@ -290,6 +292,7 @@ private void testDictionaryEncodedColumnsV1() throws Exception {
290292
"binary_field",
291293
"single_value_field",
292294
"optional_single_value_field",
295+
"optional_single_value_int32_field",
293296
"int32_field",
294297
"int64_field",
295298
"double_field",
@@ -327,6 +330,7 @@ private void testDictionaryEncodedColumnsV2() throws Exception {
327330
"binary_field",
328331
"single_value_field",
329332
"optional_single_value_field",
333+
"optional_single_value_int32_field",
330334
"fixed_field",
331335
"int32_field",
332336
"int64_field",
@@ -670,6 +674,20 @@ public void testUdp() throws Exception {
670674
canDrop(userDefined(intColumn("int32_field"), undroppable), ccmd, dictionaries));
671675
}
672676

677+
@Test
678+
public void testNullAcceptingUdp() throws Exception {
679+
InInt32UDP drop42DenyNulls = new InInt32UDP(Sets.newHashSet(205));
680+
InInt32UDP drop42AcceptNulls = new InInt32UDP(Sets.newHashSet(null, 205));
681+
682+
// A column with value 42 and 10% nulls
683+
IntColumn intColumnWithNulls = intColumn("optional_single_value_int32_field");
684+
685+
assertTrue("Should drop block", canDrop(userDefined(intColumnWithNulls, drop42DenyNulls), ccmd, dictionaries));
686+
assertFalse(
687+
"Should not drop block for null accepting udp",
688+
canDrop(userDefined(intColumnWithNulls, drop42AcceptNulls), ccmd, dictionaries));
689+
}
690+
673691
@Test
674692
public void testInverseUdp() throws Exception {
675693
InInt32UDP droppable = new InInt32UDP(ImmutableSet.of(42));

0 commit comments

Comments
 (0)