-
Notifications
You must be signed in to change notification settings - Fork 242
feat: Add sanity check to indexed FASTA file #1745
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
dcd8b41
c2a6a6b
e75f493
f12890e
b6a1842
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ | |
import java.nio.file.Path; | ||
import java.util.Iterator; | ||
|
||
|
||
/** | ||
* @author Daniel Gomez-Sanchez (magicDGS) | ||
*/ | ||
|
@@ -146,6 +147,55 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile, | |
} | ||
} | ||
|
||
/** Do some basic checking to make sure the fasta and the index match. | ||
* <p> | ||
* checks that the length of the fasta file is at least as long as the index proclaims | ||
* and that beyond the last position references in the index there is only one line followed by whitespaces | ||
* | ||
* @param fastaFile Path to fasta file | ||
* @param fastaSequenceIndexes index file to check against the fasta file. | ||
* | ||
* @throws IOException in case of io-error when reading fastaFile | ||
*/ | ||
public void sanityCheckFastaAgainstIndex(final Path fastaFile, | ||
final FastaSequenceIndex fastaSequenceIndexes) throws IOException { | ||
|
||
final FastaSequenceIndexEntry lastSequenceIndex = fastaSequenceIndexes.getIndexEntry(fastaSequenceIndexes.getLastSequence()); | ||
|
||
final long lastSequenceLength = lastSequenceIndex.getSize(); | ||
final long lastSequenceStart = lastSequenceIndex.getLocation(); | ||
final long lastSequenceEnd = lastSequenceStart + lastSequenceIndex.getOffset(lastSequenceLength); | ||
|
||
final long fastaLength = Files.size(fastaFile); | ||
|
||
if (lastSequenceEnd > fastaLength) { | ||
throw new IllegalArgumentException("The fasta file (%s) is shorter (%d) than its index (%s) claims (%d). Please reindex the fasta.".formatted(fastaFile.toUri().toString(),fastaLength, index.toString(), lastSequenceEnd)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IllegalArgument seems like the wrong exception here. I might declare a new |
||
} | ||
// if fasta file is longer than this, make sure that the remainder is just whitespaces | ||
long posOfInterest = lastSequenceEnd + lastSequenceIndex.getTerminatorLength(); | ||
if (posOfInterest < fastaLength) { | ||
|
||
final ByteBuffer channelBuffer = ByteBuffer.allocate(100); | ||
|
||
while (posOfInterest < fastaLength) { | ||
channelBuffer.clear(); | ||
readFromPosition(channelBuffer, posOfInterest); | ||
for (int i = 0; i < channelBuffer.position(); i++) { | ||
byte b = channelBuffer.get(i); | ||
if (!Character.isWhitespace((char) b)) { | ||
throw new IllegalArgumentException( | ||
("The fasta file %s is too long (relative to the index). In particular has a non-whitespace " + | ||
"character (%c) at a position (%d) beyond the last base (%d), according to its index (%s)." + | ||
" Please reindex the fasta.") | ||
.formatted(fastaFile.toUri().toString(), (char) b, posOfInterest + i, lastSequenceEnd,index.toString())); | ||
} | ||
} | ||
posOfInterest += channelBuffer.limit(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think there might be minor bug here. When you clear the buffer it should set limit = capacity, but I don't think readFromPosition guarantees that the as many bytes as possible are read, it's possible for it to return fewer than requested. I think this might have the effect of skipping parts of the fasta file on incomplete reads. Setting it with position or capturing the return value from readFromPosition would both do the right thing I think. Unlikely to be a real issue since the bug would be to sometimes fail to detect a mismatch if there are characters embedded partway through a block of white space. |
||
} | ||
} | ||
} | ||
|
||
|
||
public FastaSequenceIndex getIndex() { | ||
return index; | ||
} | ||
|
@@ -210,11 +260,12 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop | |
final int bytesPerLine = indexEntry.getBytesPerLine(); | ||
final int terminatorLength = bytesPerLine - basesPerLine; | ||
|
||
long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine; | ||
long startOffset = indexEntry.getOffset(start); | ||
|
||
// Cast to long so the second argument cannot overflow a signed integer. | ||
final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine); | ||
if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " + minBufferSize); | ||
|
||
// Allocate a buffer for reading in sequence data. | ||
final ByteBuffer channelBuffer = ByteBuffer.allocate((int)minBufferSize); | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -36,11 +36,7 @@ | |
import java.io.PrintStream; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.Iterator; | ||
import java.util.LinkedHashMap; | ||
import java.util.Map; | ||
import java.util.Objects; | ||
import java.util.Scanner; | ||
import java.util.*; | ||
import java.util.regex.MatchResult; | ||
|
||
/** | ||
|
@@ -50,26 +46,31 @@ public class FastaSequenceIndex implements Iterable<FastaSequenceIndexEntry> { | |
/** | ||
* Store the entries. Use a LinkedHashMap for consistent iteration in insertion order. | ||
*/ | ||
private final Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>(); | ||
private final Map<String, FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<>(); | ||
|
||
/** this member contains the name of the last index entry that was created during initialization. | ||
* Afterwards, subsequent operations (such as rename) might change the names and thus this member becomes unreliable. | ||
*/ | ||
private final String lastSequence; | ||
|
||
/** | ||
* Build a sequence index from the specified file. | ||
* @param indexFile File to open. | ||
* @throws FileNotFoundException if the index file cannot be found. | ||
* @throws SAMException if the index file cannot be found. | ||
*/ | ||
public FastaSequenceIndex( File indexFile ) { | ||
public FastaSequenceIndex( File indexFile ) throws SAMException { | ||
this(IOUtil.toPath(indexFile)); | ||
} | ||
|
||
/** | ||
* Build a sequence index from the specified file. | ||
* @param indexFile File to open. | ||
* @throws FileNotFoundException if the index file cannot be found. | ||
* @throws SAMException if the index file cannot be found. | ||
*/ | ||
public FastaSequenceIndex( Path indexFile ) { | ||
public FastaSequenceIndex( Path indexFile ) throws SAMException { | ||
IOUtil.assertFileIsReadable(indexFile); | ||
try (InputStream in = Files.newInputStream(indexFile)) { | ||
parseIndexFile(in); | ||
this.lastSequence = parseIndexFile(in); | ||
} catch (IOException e) { | ||
throw new SAMException("Fasta index file could not be opened: " + indexFile, e); | ||
} | ||
|
@@ -80,13 +81,15 @@ public FastaSequenceIndex( Path indexFile ) { | |
* @param in InputStream to read from. | ||
*/ | ||
public FastaSequenceIndex(InputStream in) { | ||
parseIndexFile(in); | ||
lastSequence=parseIndexFile(in); | ||
} | ||
|
||
/** | ||
* Empty, protected constructor for unit testing. | ||
* Empty, protected constructor for unit testing. Use with care, lastSequence will be incorrect. | ||
*/ | ||
protected FastaSequenceIndex() {} | ||
protected FastaSequenceIndex() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if this is really only used for unit testing. |
||
lastSequence = ""; | ||
} | ||
|
||
/** | ||
* Add a new index entry to the list. Protected for unit testing. | ||
|
@@ -144,10 +147,13 @@ public int hashCode() { | |
/** | ||
* Parse the contents of an index file, caching the results internally. | ||
* @param in InputStream to parse. | ||
* | ||
* @return the name of the last contig (for the sanity-check) | ||
*/ | ||
private void parseIndexFile(InputStream in) { | ||
private String parseIndexFile(InputStream in) { | ||
try (Scanner scanner = new Scanner(in)) { | ||
int sequenceIndex = 0; | ||
String lastContig = null; | ||
while( scanner.hasNext() ) { | ||
// Tokenize and validate the index line. | ||
String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)"); | ||
|
@@ -170,7 +176,9 @@ private void parseIndexFile(InputStream in) { | |
contig = SAMSequenceRecord.truncateSequenceName(contig); | ||
// Build sequence structure | ||
add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) ); | ||
lastContig=contig; | ||
} | ||
return lastContig; | ||
} | ||
} | ||
|
||
|
@@ -233,4 +241,13 @@ public Iterator<FastaSequenceIndexEntry> iterator() { | |
public int size() { | ||
return sequenceEntries.size(); | ||
} | ||
|
||
|
||
/** | ||
* @return The name of the last entry that was added when parsing the index file. Only guarranteed to be correct just | ||
* after initialization. Protected for access from AbstractIndexedFastaSequenceFile. | ||
*/ | ||
protected String getLastSequence() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't love this with the caveats but it's fine if it's clearly labelled. |
||
return this.lastSequence; | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -112,6 +112,26 @@ public int getSequenceIndex() { | |
return sequenceIndex; | ||
} | ||
|
||
/** Return the offset to pos as determined by the number of bases and bytes per line | ||
* | ||
* @param pos the (1-based) position in the contig that is requested | ||
* @return the offset (0-based) from 'location' where pos is located in the file. | ||
*/ | ||
public long getOffset(long pos) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
final int basesPerLine = this.getBasesPerLine(); | ||
final int bytesPerLine = this.getBytesPerLine(); | ||
|
||
return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine); | ||
} | ||
|
||
/** get the terminator length from the bytes per line and the bases per line | ||
* | ||
* @return the length of the terminator of each line (1 or 2) | ||
*/ | ||
public int getTerminatorLength() { | ||
return bytesPerLine - basesPerLine; | ||
} | ||
|
||
/** | ||
* For debugging. Emit the contents of each contig line. | ||
* @return A string representation of the contig line. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
>a test white space | ||
ACTG | ||
>b test whitespace | ||
ACTG | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should say what it throws if it detects and error.