Skip to content

Commit c6729bb

Browse files
authored
Wire up CRAM 3.1 codecs for reading. (#1736)
* Wire up CRAM 3.1 codecs for reading!
1 parent 6332941 commit c6729bb

File tree

48 files changed

+960
-228
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+960
-228
lines changed

src/main/java/htsjdk/beta/codecs/hapref/fasta/FASTACodecV1_0.java

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,15 @@
1212
import htsjdk.io.IOPath;
1313
import htsjdk.beta.plugin.HtsVersion;
1414
import htsjdk.beta.plugin.hapref.HaploidReferenceFormats;
15+
import htsjdk.samtools.util.BlockCompressedStreamConstants;
1516
import htsjdk.samtools.util.FileExtensions;
17+
import htsjdk.samtools.util.IOUtil;
1618
import htsjdk.utils.ValidationUtils;
1719

20+
import java.io.IOException;
21+
import java.io.InputStream;
22+
import java.util.zip.GZIPInputStream;
23+
1824
/**
1925
* The v1.0 FASTA codec.
2026
*/
@@ -32,20 +38,26 @@ public String getFileFormat() {
3238
}
3339

3440
@Override
35-
public int getSignatureLength() {
36-
return 1;
37-
}
41+
public int getSignatureLength() { return BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE; }
3842

3943
@Override
4044
public boolean canDecodeSignature(final SignatureStream signatureStream, final String sourceName) {
41-
ValidationUtils.nonNull(signatureStream, "signatureStream");
45+
ValidationUtils.nonNull(signatureStream, "input signatureStream may notbe null");
4246
ValidationUtils.nonNull(sourceName, "sourceName");
43-
int ch = signatureStream.read();
44-
if (ch == -1) {
45-
throw new HtsjdkIOException(
46-
String.format("Codec %s failed probing signature for resource %s", this.getDisplayName(), sourceName));
47+
48+
try {
49+
final InputStream wrappedInputStream = IOUtil.isGZIPInputStream(signatureStream) ?
50+
new GZIPInputStream(signatureStream) :
51+
signatureStream;
52+
int ch = wrappedInputStream.read();
53+
if (ch == -1) {
54+
throw new HtsjdkIOException(
55+
String.format("Codec %s failed probing signature for resource %s", this.getDisplayName(), sourceName));
56+
}
57+
return ((char) ch) == '>'; // for FASTA, this is all we have to go on...
58+
} catch (IOException e) {
59+
throw new HtsjdkIOException(String.format("Failure reading signature from stream for %s", sourceName), e);
4760
}
48-
return ((char) ch) == '>'; // for FASTA, this is all we have to go on...
4961
}
5062

5163
@Override
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
package htsjdk.beta.codecs.reads.cram.cramV3_1;
2+
3+
import htsjdk.beta.codecs.reads.cram.CRAMCodec;
4+
import htsjdk.beta.codecs.reads.cram.CRAMDecoder;
5+
import htsjdk.beta.codecs.reads.cram.CRAMEncoder;
6+
import htsjdk.beta.exception.HtsjdkIOException;
7+
import htsjdk.beta.io.bundle.Bundle;
8+
import htsjdk.beta.io.bundle.SignatureStream;
9+
import htsjdk.beta.plugin.HtsVersion;
10+
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
11+
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
12+
import htsjdk.samtools.cram.structure.CramHeader;
13+
14+
import java.io.IOException;
15+
import java.util.Arrays;
16+
17+
/**
18+
* CRAM v3.1 codec
19+
*/
20+
public class CRAMCodecV3_1 extends CRAMCodec {
21+
public static final HtsVersion VERSION_3_1 = new HtsVersion(3, 1, 0);
22+
private static final String CRAM_MAGIC_3_1 = new String(CramHeader.MAGIC) + "\3\1";
23+
24+
@Override
25+
public HtsVersion getVersion() {
26+
return VERSION_3_1;
27+
}
28+
29+
@Override
30+
public int getSignatureLength() {
31+
return CRAM_MAGIC_3_1.length();
32+
}
33+
34+
@Override
35+
public boolean canDecodeSignature(final SignatureStream signatureStream, final String sourceName) {
36+
try {
37+
final byte[] signatureBytes = new byte[getSignatureLength()];
38+
final int numRead = signatureStream.read(signatureBytes);
39+
if (numRead < getSignatureLength()) {
40+
throw new HtsjdkIOException(String.format("Failure reading content from stream for %s", sourceName));
41+
}
42+
return Arrays.equals(signatureBytes, getSignatureString().getBytes());
43+
} catch (IOException e) {
44+
throw new HtsjdkIOException(String.format("Failure reading content from stream for %s", sourceName));
45+
}
46+
}
47+
48+
@Override
49+
public CRAMDecoder getDecoder(final Bundle inputBundle, final ReadsDecoderOptions readsDecoderOptions) {
50+
return new CRAMDecoderV3_1(inputBundle, readsDecoderOptions);
51+
}
52+
53+
@Override
54+
public CRAMEncoder getEncoder(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
55+
return new CRAMEncoderV3_1(outputBundle, readsEncoderOptions);
56+
}
57+
58+
@Override
59+
protected String getSignatureString() { return CRAM_MAGIC_3_1; }
60+
61+
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
package htsjdk.beta.codecs.reads.cram.cramV3_1;
2+
3+
import htsjdk.beta.codecs.reads.cram.CRAMDecoder;
4+
import htsjdk.beta.io.bundle.Bundle;
5+
import htsjdk.beta.io.bundle.BundleResourceType;
6+
import htsjdk.beta.plugin.HtsVersion;
7+
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
8+
9+
/**
10+
* CRAM v3.1 decoder.
11+
*/
12+
public class CRAMDecoderV3_1 extends CRAMDecoder {
13+
14+
/**
15+
* Create a new CRAM v3.1 decoder. The primary resource in the input
16+
* bundle must have content type {@link BundleResourceType#CT_ALIGNED_READS} (to find a decoder for a bundle,
17+
* see {@link htsjdk.beta.plugin.registry.ReadsResolver}).
18+
*
19+
* @param bundle input {@link Bundle} to decode
20+
* @param readsDecoderOptions {@link ReadsDecoderOptions} to use
21+
*/
22+
public CRAMDecoderV3_1(final Bundle bundle, final ReadsDecoderOptions readsDecoderOptions) {
23+
super(bundle, readsDecoderOptions);
24+
}
25+
26+
@Override
27+
public HtsVersion getVersion() {
28+
return CRAMCodecV3_1.VERSION_3_1;
29+
}
30+
31+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package htsjdk.beta.codecs.reads.cram.cramV3_1;
2+
3+
import htsjdk.beta.codecs.reads.cram.CRAMEncoder;
4+
import htsjdk.beta.io.bundle.Bundle;
5+
import htsjdk.beta.io.bundle.BundleResourceType;
6+
import htsjdk.beta.plugin.HtsVersion;
7+
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
8+
import htsjdk.samtools.cram.CRAMException;
9+
10+
/**
11+
* CRAM v3.1 encoder.
12+
*/
13+
public class CRAMEncoderV3_1 extends CRAMEncoder {
14+
15+
/**
16+
* Create a new CRAM v3.1 encoder for the given output bundle. The primary resource in the
17+
* bundle must have content type {@link BundleResourceType#CT_ALIGNED_READS} (to find an encoder for a bundle,
18+
* see {@link htsjdk.beta.plugin.registry.ReadsResolver}).
19+
*
20+
* @param outputBundle output {@link Bundle} to encode
21+
* @param readsEncoderOptions {@link ReadsEncoderOptions} to use
22+
*/
23+
public CRAMEncoderV3_1(final Bundle outputBundle, final ReadsEncoderOptions readsEncoderOptions) {
24+
super(outputBundle, readsEncoderOptions);
25+
throw new CRAMException("CRAM v3.1 encoding is not yet supported");
26+
}
27+
28+
@Override
29+
public HtsVersion getVersion() {
30+
return CRAMCodecV3_1.VERSION_3_1;
31+
}
32+
33+
}

src/main/java/htsjdk/beta/plugin/registry/HtsCodecResolver.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,11 @@ private SignatureStream getIOPathSignatureProbingStream(
373373
return bundleResource.getSignatureStream(streamPrefixSize);
374374
}
375375

376-
private List<C> filterByVersion(final List<C> candidateCodecs, final HtsVersion htsVersion) {
376+
/**
377+
* Filter the candidate codecs, returning only those that match the requested version.
378+
* Only called for encoding.
379+
*/
380+
protected List<C> filterByVersion(final List<C> candidateCodecs, final HtsVersion htsVersion) {
377381
ValidationUtils.nonNull(htsVersion, "htsVersion");
378382
if (candidateCodecs.isEmpty()) {
379383
return candidateCodecs;

src/main/java/htsjdk/beta/plugin/registry/HtsDefaultRegistry.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ public class HtsDefaultRegistry {
1717
/**
1818
* statically populate the default registry with any codecs on the classpath
1919
*/
20-
static {ServiceLoader.load(HtsCodec .class).forEach(htsDefaultCodecRegistry::registerCodec);}
20+
static {ServiceLoader.load(HtsCodec.class).forEach(htsDefaultCodecRegistry::registerCodec);}
2121

2222
/**
2323
* Grt the {@link HaploidReferenceResolver} resolver for this registry.

src/main/java/htsjdk/beta/plugin/registry/ReadsResolver.java

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package htsjdk.beta.plugin.registry;
22

3+
import htsjdk.beta.codecs.reads.cram.cramV3_1.CRAMCodecV3_1;
34
import htsjdk.beta.exception.HtsjdkException;
45
import htsjdk.beta.exception.HtsjdkPluginException;
56
import htsjdk.beta.plugin.HtsVersion;
@@ -11,9 +12,13 @@
1112
import htsjdk.beta.plugin.reads.ReadsDecoderOptions;
1213
import htsjdk.beta.plugin.reads.ReadsEncoder;
1314
import htsjdk.beta.plugin.reads.ReadsEncoderOptions;
15+
import htsjdk.beta.plugin.reads.ReadsFormats;
1416
import htsjdk.io.IOPath;
1517
import htsjdk.utils.ValidationUtils;
1618

19+
import java.util.List;
20+
import java.util.stream.Collectors;
21+
1722
/**
1823
* Class with methods for resolving inputs and outputs to reads encoders and decoders.
1924
* <p>
@@ -204,6 +209,30 @@ public ReadsEncoder getReadsEncoder(
204209
.getEncoder(outputBundle, readsEncoderOptions);
205210
}
206211

212+
/**
213+
* Temporarily override to remove the CRAM 3.1 codec from the list of candidate codecs when the request is for
214+
* the newest version, since it has no write implementation yet.
215+
*/
216+
@Override
217+
protected List<ReadsCodec> filterByVersion(final List<ReadsCodec> candidateCodecs, final HtsVersion htsVersion) {
218+
final List<ReadsCodec> preFilteredCodecs;
219+
if (htsVersion.equals(HtsVersion.NEWEST_VERSION)) {
220+
// if the request is for the newest version, then pre-filter out the CRAM 3.1 codec since it has no
221+
// write implementation yet, and then delegate to the superclass to let it find the newest version among
222+
// the remaining codecs
223+
preFilteredCodecs = candidateCodecs.stream().filter(
224+
c -> !(c.getFileFormat().equals(ReadsFormats.CRAM)
225+
&& c.getVersion().equals(CRAMCodecV3_1.VERSION_3_1)))
226+
.collect(Collectors.toList());
227+
final HtsVersion newestVersion = preFilteredCodecs.stream()
228+
.map(c -> c.getVersion())
229+
.reduce(candidateCodecs.get(0).getVersion(),
230+
(HtsVersion a, HtsVersion b) -> a.compareTo(b) > 0 ? a : b);
231+
return candidateCodecs.stream().filter(
232+
c -> c.getVersion().equals(newestVersion)).collect(Collectors.toList());
233+
} else {
234+
preFilteredCodecs = candidateCodecs;
235+
}
236+
return super.filterByVersion(preFilteredCodecs, htsVersion);
237+
}
207238
}
208-
209-

src/main/java/htsjdk/samtools/cram/common/CramVersions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,12 @@
66
public final class CramVersions {
77
public static final CRAMVersion CRAM_v2_1 = new CRAMVersion(2, 1);
88
public static final CRAMVersion CRAM_v3 = new CRAMVersion(3, 0);
9+
public static final CRAMVersion CRAM_v3_1 = new CRAMVersion(3, 1);
910

1011
final static Set<CRAMVersion> supportedCRAMVersions = new HashSet<CRAMVersion>() {{
1112
add(CRAM_v2_1);
1213
add(CRAM_v3);
14+
add(CRAM_v3_1);
1315
}};
1416

1517
/**

src/main/java/htsjdk/samtools/cram/compression/BZIP2ExternalCompressor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
package htsjdk.samtools.cram.compression;
2626

2727
import htsjdk.samtools.cram.io.InputStreamUtils;
28+
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
2829
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
2930
import htsjdk.samtools.util.IOUtil;
3031
import htsjdk.samtools.util.RuntimeIOException;
@@ -42,7 +43,7 @@ public BZIP2ExternalCompressor() {
4243
}
4344

4445
@Override
45-
public byte[] compress(final byte[] data) {
46+
public byte[] compress(final byte[] data, final CRAMCodecModelContext unused_contextModel) {
4647
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
4748
try (final BZip2CompressorOutputStream bos = new BZip2CompressorOutputStream(byteArrayOutputStream)) {
4849
IOUtil.copyStream(new ByteArrayInputStream(data), bos);

src/main/java/htsjdk/samtools/cram/compression/CompressionUtils.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,4 +175,22 @@ public static ByteBuffer wrap(final byte[] inputBytes){
175175
public static ByteBuffer slice(final ByteBuffer inputBuffer){
176176
return inputBuffer.slice().order(ByteOrder.LITTLE_ENDIAN);
177177
}
178+
179+
/**
180+
* Return a byte array with a size that matches the limit of the provided ByteBuffer. If the ByteBuffer is
181+
* backed by a byte array that matches the limit of the ByteBuffer, the backing array will be returned directly.
182+
* Otherwise, copy the contents of the ByteBuffer into a new byte array and return the new byte array.
183+
* @param buffer input ByteBuffer which is the source of the byte array
184+
* @return A byte array. If the ByteBuffer is backed by a byte array that matches the limit of the ByteBuffer,
185+
* return the backing array directly. Otherwise, copy the contents of the ByteBuffer into a new byte array.
186+
*/
187+
public static byte[] toByteArray(final ByteBuffer buffer) {
188+
if (buffer.hasArray() && buffer.arrayOffset() == 0 && buffer.array().length == buffer.limit()) {
189+
return buffer.array();
190+
}
191+
192+
final byte[] bytes = new byte[buffer.limit() - buffer.arrayOffset()];
193+
buffer.get(bytes);
194+
return bytes;
195+
}
178196
}

src/main/java/htsjdk/samtools/cram/compression/ExternalCompressor.java

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
package htsjdk.samtools.cram.compression;
22

3+
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompDecode;
4+
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompEncode;
5+
import htsjdk.samtools.cram.compression.fqzcomp.FQZCompExternalCompressor;
6+
import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationDecode;
7+
import htsjdk.samtools.cram.compression.nametokenisation.NameTokenisationEncode;
8+
import htsjdk.samtools.cram.compression.nametokenisation.NameTokeniserExternalCompressor;
39
import htsjdk.samtools.cram.compression.range.RangeDecode;
410
import htsjdk.samtools.cram.compression.range.RangeEncode;
11+
import htsjdk.samtools.cram.compression.range.RangeExternalCompressor;
512
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Decode;
613
import htsjdk.samtools.cram.compression.rans.rans4x8.RANS4x8Encode;
14+
import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Decode;
15+
import htsjdk.samtools.cram.compression.rans.ransnx16.RANSNx16Encode;
16+
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
717
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
818
import htsjdk.utils.ValidationUtils;
919

@@ -17,7 +27,13 @@ protected ExternalCompressor(final BlockCompressionMethod method) {
1727
this.method = method;
1828
}
1929

20-
public abstract byte[] compress(byte[] data);
30+
/**
31+
* Compress the data using the codec-specific context model.
32+
* @param data the data to compress
33+
* @param contextModel the context model to use for compression; may be null
34+
* @return the compressed data
35+
*/
36+
public abstract byte[] compress(byte[] data, CRAMCodecModelContext contextModel);
2137

2238
public abstract byte[] uncompress(byte[] data);
2339

@@ -74,14 +90,25 @@ public static ExternalCompressor getCompressorForMethod(
7490

7591
case RANS:
7692
return compressorSpecificArg == NO_COMPRESSION_ARG ?
77-
new RANSExternalCompressor(new RANS4x8Encode(), new RANS4x8Decode()) :
78-
new RANSExternalCompressor(compressorSpecificArg, new RANS4x8Encode(), new RANS4x8Decode());
93+
new RANS4x8ExternalCompressor(new RANS4x8Encode(), new RANS4x8Decode()) :
94+
new RANS4x8ExternalCompressor(compressorSpecificArg, new RANS4x8Encode(), new RANS4x8Decode());
95+
96+
case RANSNx16:
97+
return compressorSpecificArg == NO_COMPRESSION_ARG ?
98+
new RANSNx16ExternalCompressor(new RANSNx16Encode(), new RANSNx16Decode()) :
99+
new RANSNx16ExternalCompressor(compressorSpecificArg, new RANSNx16Encode(), new RANSNx16Decode());
79100

80-
case RANGE:
101+
case ADAPTIVE_ARITHMETIC:
81102
return compressorSpecificArg == NO_COMPRESSION_ARG ?
82103
new RangeExternalCompressor(new RangeEncode(), new RangeDecode()) :
83104
new RangeExternalCompressor(compressorSpecificArg, new RangeEncode(), new RangeDecode());
84105

106+
case NAME_TOKENISER:
107+
return new NameTokeniserExternalCompressor(new NameTokenisationEncode(), new NameTokenisationDecode());
108+
109+
case FQZCOMP:
110+
return new FQZCompExternalCompressor(new FQZCompEncode(), new FQZCompDecode());
111+
85112
case BZIP2:
86113
ValidationUtils.validateArg(
87114
compressorSpecificArg == NO_COMPRESSION_ARG,

src/main/java/htsjdk/samtools/cram/compression/GZIPExternalCompressor.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import htsjdk.samtools.Defaults;
2828
import htsjdk.samtools.cram.io.InputStreamUtils;
29+
import htsjdk.samtools.cram.structure.CRAMCodecModelContext;
2930
import htsjdk.samtools.cram.structure.block.BlockCompressionMethod;
3031
import htsjdk.samtools.util.IOUtil;
3132
import htsjdk.samtools.util.RuntimeIOException;
@@ -62,7 +63,7 @@ public GZIPExternalCompressor(final int compressionLevel) {
6263
public int getWriteCompressionLevel() { return writeCompressionLevel; }
6364

6465
@Override
65-
public byte[] compress(final byte[] data) {
66+
public byte[] compress(final byte[] data, final CRAMCodecModelContext unused_contextModel) {
6667
final ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
6768
try (final GZIPOutputStream gos = new GZIPOutputStream(byteArrayOutputStream) {
6869
{

0 commit comments

Comments
 (0)