Skip to content

Commit 7034b33

Browse files
cmnbroad and lbergelson authored
Support reference bundles. (#1713)
* Add support for creating ReferenceSequenceFile from a reference Bundle. This allows using a reference that has supporting files in non-standard locations by specifying them individually in the bundle. --------- Co-authored-by: Louis Bergelson <[email protected]>
1 parent 4cc0100 commit 7034b33

16 files changed

+688
-41
lines changed

src/main/java/htsjdk/beta/codecs/hapref/fasta/FASTADecoderV1_0.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@ public FASTADecoderV1_0(final Bundle inputBundle) {
3434
this.displayName = inputBundle.getPrimaryResource().getDisplayName();
3535
final BundleResource referenceResource = inputBundle.getOrThrow(BundleResourceType.CT_HAPLOID_REFERENCE);
3636
if (referenceResource.getIOPath().isPresent()) {
37-
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(
38-
referenceResource.getIOPath().get().toPath(), true);
37+
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFileFromBundle(inputBundle, true, true);
3938
} else {
4039
final SeekableStream seekableStream = referenceResource.getSeekableStream().orElseThrow(
4140
() -> new IllegalArgumentException(

src/main/java/htsjdk/beta/io/bundle/BundleResourceType.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ public class BundleResourceType {
5656
/** Secondary content types for {@link BundleResourceType#CT_HAPLOID_REFERENCE} resources */
5757
public static final String CT_REFERENCE_DICTIONARY = "REFERENCE_DICTIONARY";
5858
public static final String CT_REFERENCE_INDEX = "REFERENCE_INDEX";
59+
public static final String CT_REFERENCE_INDEX_GZI = "REFERENCE_INDEX_GZI";
5960

6061

6162
/****************************************** Resource types for FEATURES ********************************/

src/main/java/htsjdk/beta/plugin/registry/HaploidReferenceResolver.java

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,18 @@
99
import htsjdk.beta.plugin.hapref.HaploidReferenceCodec;
1010
import htsjdk.beta.plugin.hapref.HaploidReferenceDecoder;
1111
import htsjdk.beta.plugin.hapref.HaploidReferenceDecoderOptions;
12+
import htsjdk.io.HtsPath;
1213
import htsjdk.io.IOPath;
14+
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
15+
import htsjdk.samtools.util.GZIIndex;
16+
import htsjdk.samtools.util.IOUtil;
1317
import htsjdk.utils.ValidationUtils;
1418

19+
import java.io.IOException;
20+
import java.nio.file.Files;
21+
import java.nio.file.Path;
22+
import java.util.function.Function;
23+
1524
/**
1625
* Class with methods for resolving inputs and outputs to haploid reference encoders and decoders.
1726
* <p>
@@ -66,9 +75,7 @@ public HaploidReferenceDecoder getHaploidReferenceDecoder(
6675
ValidationUtils.nonNull(inputPath, "Input path");
6776
ValidationUtils.nonNull(HaploidReferenceDecoderOptions, "Decoder options");
6877

69-
final Bundle referenceBundle = new BundleBuilder().addPrimary(
70-
new IOPathResource(inputPath, BundleResourceType.CT_HAPLOID_REFERENCE)).build();
71-
78+
final Bundle referenceBundle = referenceBundleFromFastaPath(inputPath, HtsPath::new);
7279
return getHaploidReferenceDecoder(referenceBundle, HaploidReferenceDecoderOptions);
7380
}
7481

@@ -110,4 +117,47 @@ public HaploidReferenceDecoder getHaploidReferenceDecoder(
110117
return (HaploidReferenceDecoder) resolveForDecoding(inputBundle).getDecoder(inputBundle, HaploidReferenceDecoderOptions);
111118
}
112119

120+
/**
121+
* Create a reference bundle given only a fasta path, including an index and a dictionary
122+
* file if they are present and located in the same directory as the fasta.
123+
*
124+
* @param fastaPath location of the fasta
125+
* @param ioPathConstructor a constructor used to create IOPath-derived objects for the bundle
126+
* @return a reference Bundle
127+
* @param <T>
128+
*/
129+
public static <T extends IOPath> Bundle referenceBundleFromFastaPath(final IOPath fastaPath, final Function<String, T> ioPathConstructor) {
130+
final BundleBuilder referenceBundleBuilder = new BundleBuilder();
131+
referenceBundleBuilder.addPrimary(new IOPathResource(fastaPath, BundleResourceType.CT_HAPLOID_REFERENCE));
132+
133+
final Path dictPath = ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(fastaPath.toPath());
134+
if (Files.exists(dictPath)) {
135+
referenceBundleBuilder.addSecondary(
136+
new IOPathResource(
137+
ioPathConstructor.apply(dictPath.toUri().toString()),
138+
BundleResourceType.CT_REFERENCE_DICTIONARY));
139+
}
140+
141+
final Path idxPath = ReferenceSequenceFileFactory.getFastaIndexFileName(fastaPath.toPath());
142+
if (Files.exists(idxPath)) {
143+
referenceBundleBuilder.addSecondary(
144+
new IOPathResource(
145+
ioPathConstructor.apply(idxPath.toUri().toString()),
146+
BundleResourceType.CT_REFERENCE_INDEX));
147+
}
148+
149+
try {
150+
if (IOUtil.isBlockCompressed(fastaPath.toPath(), true)) {
151+
final Path gziPath = GZIIndex.resolveIndexNameForBgzipFile(fastaPath.toPath());
152+
referenceBundleBuilder.addSecondary(
153+
new IOPathResource(
154+
ioPathConstructor.apply(gziPath.toUri().toString()),
155+
BundleResourceType.CT_REFERENCE_INDEX_GZI));
156+
}
157+
} catch (IOException e) {
158+
throw new HtsjdkException("Error while checking for block compression", e);
159+
}
160+
return referenceBundleBuilder.build();
161+
}
162+
113163
}

src/main/java/htsjdk/beta/plugin/variants/VariantsBundle.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
/**
1919
* A {@link Bundle} for variants and variants-related resources that are backed by on disk files. A {@link
2020
* htsjdk.beta.plugin.variants.VariantsBundle} has a primary resource with content type {@link
21-
* BundleResourceType#PRIMARY_CT_VARIANT_CONTEXTS}; and an optional index resource. A VariantsBundle can also
21+
* BundleResourceType#CT_VARIANT_CONTEXTS}; and an optional index resource. A VariantsBundle can also
2222
* contain additional resources.
2323
*
2424
* Note that this class is merely a convenience class for the case where the variants are backed by files on disk.
@@ -31,6 +31,7 @@ public class VariantsBundle extends Bundle implements Serializable {
3131
@Serial
3232
private static final long serialVersionUID = 1L;
3333
private static final Log LOG = Log.getInstance(VariantsBundle.class);
34+
3435
/**
3536
* Create a {@link htsjdk.beta.plugin.variants.VariantsBundle} containing only a variants resource.
3637
*

src/main/java/htsjdk/samtools/reference/AbstractFastaSequenceFile.java

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,16 @@
2424

2525
package htsjdk.samtools.reference;
2626

27+
import htsjdk.io.HtsPath;
28+
import htsjdk.io.IOPath;
2729
import htsjdk.samtools.SAMException;
28-
import htsjdk.samtools.SAMFileHeader;
2930
import htsjdk.samtools.SAMSequenceDictionary;
30-
import htsjdk.samtools.SAMTextHeaderCodec;
31-
import htsjdk.samtools.util.BufferedLineReader;
3231
import htsjdk.samtools.util.FileExtensions;
3332
import htsjdk.samtools.util.IOUtil;
3433
import htsjdk.samtools.util.Lazy;
3534

3635
import java.io.File;
36+
import java.io.IOException;
3737
import java.io.InputStream;
3838
import java.io.Serializable;
3939
import java.nio.file.Files;
@@ -84,13 +84,25 @@ abstract class AbstractFastaSequenceFile implements ReferenceSequenceFile {
8484
/** Attempts to find and load the sequence dictionary if present. */
8585
protected SAMSequenceDictionary findAndLoadSequenceDictionary(final Path fasta) {
8686
final Path dictPath = findSequenceDictionary(path);
87-
if (dictPath == null) return null;
87+
if (dictPath == null) {
88+
return null;
89+
}
90+
return loadSequenceDictionary(new HtsPath(dictPath.toUri().toString()));
91+
}
8892

89-
IOUtil.assertFileIsReadable(dictPath);
90-
try (InputStream dictionaryIn = IOUtil.openFileForReading(dictPath)) {
91-
return ReferenceSequenceFileFactory.loadDictionary(dictionaryIn);
93+
/**
94+
* Attempt to load a sequence dictionary given a file path. Path may be null.
95+
* @param dictPath the dictionary file to open
96+
* @return the SAMSequenceDictionary, or null
97+
*/
98+
protected static SAMSequenceDictionary loadSequenceDictionary(final IOPath dictPath) {
99+
if (dictPath == null) {
100+
return null;
92101
}
93-
catch (Exception e) {
102+
IOUtil.assertFileIsReadable(dictPath.toPath());
103+
try (final InputStream dictionaryStream = IOUtil.openFileForReading(dictPath.toPath())) {
104+
return ReferenceSequenceFileFactory.loadDictionary(dictionaryStream);
105+
} catch (final IOException e) {
94106
throw new SAMException("Could not open sequence dictionary file: " + dictPath, e);
95107
}
96108
}

src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
package htsjdk.samtools.reference;
2626

27+
import htsjdk.io.IOPath;
2728
import htsjdk.samtools.Defaults;
2829
import htsjdk.samtools.SAMException;
2930
import htsjdk.samtools.SAMSequenceDictionary;
@@ -69,6 +70,27 @@ protected AbstractIndexedFastaSequenceFile(final Path path, final FastaSequenceI
6970
}
7071
}
7172

73+
/**
 * Create an AbstractIndexedFastaSequenceFile from explicitly provided files. No assumptions are made
 * about the relative location of the files (i.e., that they are siblings).
 *
 * @param fastaPath the path to the fasta file. may not be null.
 * @param dictPath the path to the sequence dictionary. may be null.
 * @param index the associated index object; may not be null.
 * @throws IllegalArgumentException if {@code index} is null
 */
protected AbstractIndexedFastaSequenceFile(final IOPath fastaPath, final IOPath dictPath, final FastaSequenceIndex index) {
    super(fastaPath.toPath(), fastaPath.getURIString(), loadSequenceDictionary(dictPath));
    if (index == null) {
        // Name the fasta in the message; concatenating the (null) index would only ever print "null".
        throw new IllegalArgumentException("Null index for fasta " + fastaPath.getRawInputString());
    }
    this.index = index;
    IOUtil.assertFileIsReadable(fastaPath.toPath());
    reset();
    // Only cross-check the dictionary against the index when a dictionary is actually available.
    if (getSequenceDictionary() != null) {
        sanityCheckDictionaryAgainstIndex(fastaPath.getRawInputString(), getSequenceDictionary(), index);
    }
}
93+
7294
/**
7395
* Initialise the given indexed fasta sequence file stream.
7496
* @param source The named source of the reference file (used in error messages).

src/main/java/htsjdk/samtools/reference/BlockCompressedIndexedFastaSequenceFile.java

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
package htsjdk.samtools.reference;
2626

27+
import htsjdk.io.IOPath;
2728
import htsjdk.samtools.SAMException;
2829
import htsjdk.samtools.SAMSequenceDictionary;
2930
import htsjdk.samtools.seekablestream.SeekablePathStream;
@@ -54,6 +55,32 @@ public BlockCompressedIndexedFastaSequenceFile(final Path path)
5455
this(path, new FastaSequenceIndex((findRequiredFastaIndexFile(path))));
5556
}
5657

58+
/**
59+
* Create a BlockCompressedIndexedFastaSequenceFile from explicitly provided files. No assumptions are made
60+
* about the relative location of the files (i.e., no assumption is made that they are siblings).
61+
* @param fastaPath the fasta file
62+
* @param dictPath the associated dictionary file
63+
* @param index the associated index
64+
* @param gziIndex the associated gziIndex
65+
*/
66+
public BlockCompressedIndexedFastaSequenceFile(
67+
final IOPath fastaPath,
68+
final IOPath dictPath,
69+
final FastaSequenceIndex index,
70+
final GZIIndex gziIndex) {
71+
super(fastaPath, dictPath, index);
72+
if (gziIndex == null) {
73+
throw new IllegalArgumentException("null gzi index");
74+
}
75+
assertIsBlockCompressed(fastaPath.toPath());
76+
try {
77+
stream = new BlockCompressedInputStream(new SeekablePathStream(fastaPath.toPath()));
78+
gzindex = gziIndex;
79+
} catch (IOException e) {
80+
throw new SAMException("Fasta file should be readable but is not: " + fastaPath, e);
81+
}
82+
}
83+
5784
public BlockCompressedIndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index) {
5885
this(path, index, loadFastaGziIndex(path));
5986
}

src/main/java/htsjdk/samtools/reference/FastaSequenceFile.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
package htsjdk.samtools.reference;
2626

27+
import htsjdk.io.IOPath;
2728
import htsjdk.samtools.Defaults;
2829
import htsjdk.samtools.SAMException;
2930
import htsjdk.samtools.SAMSequenceDictionary;
@@ -64,6 +65,21 @@ public FastaSequenceFile(final Path path, final boolean truncateNamesAtWhitespac
6465
this.in = new FastLineReader(IOUtil.openFileForReading(path));
6566
}
6667

68+
/**
 * Constructs a FastaSequenceFile that reads from an explicitly specified fasta and, optionally, an
 * explicitly specified dictionary file. The two files are not assumed to share a directory. The fasta
 * is read through a line reader opened on the path; no seekable stream is used.
 *
 * @param fastaPath may not be null
 * @param dictPath may be null, in which case no sequence dictionary is loaded
 * @param truncateNamesAtWhitespace flag stored on this instance; judging by its name it controls
 *                                  whether sequence names are cut at the first whitespace (confirm
 *                                  against the reader code)
 */
public FastaSequenceFile(final IOPath fastaPath, final IOPath dictPath, final boolean truncateNamesAtWhitespace) {
    super(
            fastaPath.toPath(),
            fastaPath.toString(),
            dictPath != null ? loadSequenceDictionary(dictPath) : null);
    this.seekableStream = null;
    this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
    this.in = new FastLineReader(IOUtil.openFileForReading(fastaPath.toPath()));
}
82+
6783
/**
6884
* Constructs a FastaSequenceFile that reads from the specified stream (which must not be compressed, i.e.
6985
* the caller is responsible for decompressing the stream).

src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,16 @@
2424

2525
package htsjdk.samtools.reference;
2626

27+
import htsjdk.io.IOPath;
2728
import htsjdk.samtools.SAMException;
2829
import htsjdk.samtools.SAMSequenceDictionary;
2930
import htsjdk.samtools.seekablestream.ReadableSeekableStreamByteChannel;
3031
import htsjdk.samtools.seekablestream.SeekableStream;
31-
import htsjdk.samtools.util.BlockCompressedInputStream;
3232
import htsjdk.samtools.util.IOUtil;
3333

34-
import java.io.BufferedInputStream;
3534
import java.io.File;
3635
import java.io.FileNotFoundException;
3736
import java.io.IOException;
38-
import java.io.InputStream;
3937
import java.nio.ByteBuffer;
4038
import java.nio.channels.FileChannel;
4139
import java.nio.channels.SeekableByteChannel;
@@ -56,7 +54,6 @@ public class IndexedFastaSequenceFile extends AbstractIndexedFastaSequenceFile {
5654
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
5755
* @param file The file to open.
5856
* @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk.
59-
* @throws FileNotFoundException If the fasta or any of its supporting files cannot be found.
6057
*/
6158
public IndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) {
6259
this(IOUtil.toPath(file), index);
@@ -79,7 +76,7 @@ public IndexedFastaSequenceFile(final File file) throws FileNotFoundException {
7976
public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index) {
8077
super(path, index);
8178
try {
82-
// check if the it is a valid block-compressed file
79+
// check if it is a valid block-compressed file
8380
if (IOUtil.isBlockCompressed(path, true)) {
8481
throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
8582
}
@@ -89,6 +86,26 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index)
8986
}
9087
}
9188

89+
/**
 * Open an indexed fasta sequence file from explicitly provided files. An exception is thrown if the
 * file is block compressed or cannot be opened.
 *
 * @param path The file to open.
 * @param dictPath the dictionary path (may be null)
 * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. may not be null.
 * @throws SAMException if the fasta is block compressed, or if an IOException is raised while
 *                      inspecting or opening it
 */
public IndexedFastaSequenceFile(final IOPath path, final IOPath dictPath, final FastaSequenceIndex index) {
    super(path, dictPath, index);
    try {
        if (!IOUtil.isBlockCompressed(path.toPath(), true)) {
            this.channel = Files.newByteChannel(path.toPath());
        } else {
            // block-compressed input belongs to BlockCompressedIndexedFastaSequenceFile
            throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
        }
    } catch (IOException e) {
        throw new SAMException("FASTA file should be readable but is not: " + path, e);
    }
}
108+
92109
/**
93110
* Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened.
94111
* @param path The file to open.

0 commit comments

Comments
 (0)