Skip to content

Commit e962bc4

Browse files
authored
AVRO-3983: Allow setting a custom encoder in DataFileWriter (#2874)
1 parent 76991e9 commit e962bc4

File tree

2 files changed

+45
-16
lines changed

2 files changed

+45
-16
lines changed

lang/java/avro/src/main/java/org/apache/avro/file/DataFileWriter.java

+15-1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import java.util.HashMap;
3535
import java.util.Map;
3636
import java.util.UUID;
37+
import java.util.function.Function;
3738

3839
import org.apache.avro.AvroRuntimeException;
3940
import org.apache.avro.Schema;
@@ -73,6 +74,8 @@ public class DataFileWriter<D> implements Closeable, Flushable {
7374

7475
private byte[] sync; // 16 random bytes
7576
private int syncInterval = DataFileConstants.DEFAULT_SYNC_INTERVAL;
77+
// Factory for the encoder that init(OutputStream) wraps around the in-memory
// buffer. Defaults to a direct binary encoder from a fresh EncoderFactory and
// can be replaced through setEncoder.
private Function<OutputStream, BinaryEncoder> initEncoder = out -> new EncoderFactory().directBinaryEncoder(out,
78+
null);
7679

7780
private boolean isOpen;
7881
private Codec codec;
@@ -130,6 +133,17 @@ public DataFileWriter<D> setSyncInterval(int syncInterval) {
130133
return this;
131134
}
132135

136+
/**
137+
* Replaces the factory used to create the encoder that writes into this
* writer's in-memory buffer. The default factory builds a direct binary encoder.
138+
*
* The function is stored here and applied in init(OutputStream).
* NOTE(review): a call made after the writer has been initialized presumably
* has no effect, and a null argument is not rejected here -- confirm.
*
139+
* @param initEncoderFunc function that receives the buffer's OutputStream and
*                        returns the BinaryEncoder to write through
140+
* @return this DataFileWriter
141+
*/
142+
public DataFileWriter<D> setEncoder(Function<OutputStream, BinaryEncoder> initEncoderFunc) {
143+
this.initEncoder = initEncoderFunc;
144+
return this;
145+
}
146+
133147
/** Open a new file for data matching a schema with a random sync. */
134148
public DataFileWriter<D> create(Schema schema, File file) throws IOException {
135149
SyncableFileOutputStream sfos = new SyncableFileOutputStream(file);
@@ -242,7 +256,7 @@ private void init(OutputStream outs) throws IOException {
242256
this.vout = efactory.directBinaryEncoder(out, null);
243257
dout.setSchema(schema);
244258
buffer = new NonCopyingByteArrayOutputStream(Math.min((int) (syncInterval * 1.25), Integer.MAX_VALUE / 2 - 1));
245-
this.bufOut = efactory.directBinaryEncoder(buffer, null);
259+
this.bufOut = this.initEncoder.apply(buffer);
246260
if (this.codec == null) {
247261
this.codec = CodecFactory.nullCodec().createInstance();
248262
}

lang/java/avro/src/test/java/org/apache/avro/TestDataFile.java

+30-15
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@
2525
import java.io.ByteArrayOutputStream;
2626
import java.io.File;
2727
import java.io.IOException;
28+
import java.io.OutputStream;
2829
import java.util.ArrayList;
2930
import java.util.List;
3031
import java.util.Random;
32+
import java.util.function.Function;
3133
import java.util.stream.Stream;
3234

3335
import org.apache.avro.file.CodecFactory;
@@ -40,7 +42,9 @@
4042
import org.apache.avro.generic.GenericData;
4143
import org.apache.avro.generic.GenericDatumReader;
4244
import org.apache.avro.generic.GenericDatumWriter;
45+
import org.apache.avro.io.BinaryEncoder;
4346
import org.apache.avro.io.DatumReader;
47+
import org.apache.avro.io.EncoderFactory;
4448
import org.apache.avro.util.RandomData;
4549

4650
import org.junit.jupiter.api.Test;
@@ -93,22 +97,32 @@ private File makeFile(CodecFactory codec) {
9397
@ParameterizedTest
9498
@MethodSource("codecs")
9599
public void runTestsInOrder(CodecFactory codec) throws Exception {
96-
LOG.info("Running with codec: " + codec);
97-
testGenericWrite(codec);
98-
testGenericRead(codec);
99-
testSplits(codec);
100-
testSyncDiscovery(codec);
101-
testGenericAppend(codec);
102-
testReadWithHeader(codec);
103-
testFSync(codec, false);
104-
testFSync(codec, true);
100+
// Run the full sequence once per encoder factory (direct and blocking direct).
101+
// The encoders are looped over inside the test because supplying them through
// the @MethodSource did not work well.
// NOTE(review): the log line below names only the codec, not the encoder in use.
102+
List<Function<OutputStream, BinaryEncoder>> encoders = new ArrayList<>();
103+
encoders.add(b -> new EncoderFactory().directBinaryEncoder(b, null));
104+
encoders.add(b -> new EncoderFactory().blockingDirectBinaryEncoder(b, null));
105+
106+
for (Function<OutputStream, BinaryEncoder> encoder : encoders) {
107+
LOG.info("Running with codec: {}", codec);
108+
testGenericWrite(codec, encoder);
109+
testGenericRead(codec);
110+
testSplits(codec);
111+
testSyncDiscovery(codec);
112+
testGenericAppend(codec, encoder);
113+
testReadWithHeader(codec);
114+
testFSync(codec, encoder, false);
115+
testFSync(codec, encoder, true);
116+
}
105117
}
106118

107-
private void testGenericWrite(CodecFactory codec) throws IOException {
119+
private void testGenericWrite(CodecFactory codec, Function<OutputStream, BinaryEncoder> encoderFunc)
120+
throws IOException {
108121
DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>()).setSyncInterval(100);
109122
if (codec != null) {
110123
writer.setCodec(codec);
111124
}
125+
writer.setEncoder(encoderFunc);
112126
writer.create(SCHEMA, makeFile(codec));
113127
try {
114128
int count = 0;
@@ -210,10 +224,12 @@ private void testSyncDiscovery(CodecFactory codec) throws IOException {
210224
}
211225
}
212226

213-
private void testGenericAppend(CodecFactory codec) throws IOException {
227+
private void testGenericAppend(CodecFactory codec, Function<OutputStream, BinaryEncoder> encoderFunc)
228+
throws IOException {
214229
File file = makeFile(codec);
215230
long start = file.length();
216231
try (DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>()).appendTo(file)) {
232+
writer.setEncoder(encoderFunc);
217233
for (Object datum : new RandomData(SCHEMA, COUNT, SEED + 1)) {
218234
writer.append(datum);
219235
}
@@ -254,11 +270,8 @@ private void testReadWithHeader(CodecFactory codec) throws IOException {
254270
assertEquals(validPos, sin.tell(), "Should not move from sync point on reopen");
255271
assertNotNull(readerFalse.next(), "Should be able to reopen at sync point");
256272
}
257-
258273
}
259-
260274
}
261-
262275
}
263276

264277
@Test
@@ -306,8 +319,10 @@ public void flushCount() throws IOException {
306319
assertTrue(out.flushCount < currentCount && out.flushCount >= flushCounter);
307320
}
308321

309-
private void testFSync(CodecFactory codec, boolean useFile) throws IOException {
322+
private void testFSync(CodecFactory codec, Function<OutputStream, BinaryEncoder> encoderFunc, boolean useFile)
323+
throws IOException {
310324
try (DataFileWriter<Object> writer = new DataFileWriter<>(new GenericDatumWriter<>())) {
325+
writer.setEncoder(encoderFunc);
311326
writer.setFlushOnEveryBlock(false);
312327
TestingByteArrayOutputStream out = new TestingByteArrayOutputStream();
313328
if (useFile) {

0 commit comments

Comments
 (0)