samtools · yfarjoun · May 9, 2025 · May 9, 2025 · May 9, 2025 · May 9, 2025
diff --git a/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFile.java
@@ -38,6 +38,7 @@
 import java.nio.file.Path;
 import java.util.Iterator;
 
+
 /**
  * @author Daniel Gomez-Sanchez (magicDGS)
  */
@@ -146,6 +147,55 @@ protected static void sanityCheckDictionaryAgainstIndex(final String fastaFile,
         }
     }
 
+    /** Do some basic checking to make sure the fasta and the index match.
+     * <p>
+     * checks that the length of the fasta file is at least as long as the index proclaims
+     * and that beyond the last position references in the index there is only one line followed by whitespaces
+     *
+     * @param fastaFile Path to fasta file
+     * @param fastaSequenceIndexes index file to check against the fasta file.
+     *
+     * @throws IOException in case of io-error when reading fastaFile
+     */
+    public void sanityCheckFastaAgainstIndex(final Path fastaFile,
+                                             final FastaSequenceIndex fastaSequenceIndexes) throws IOException {
+
+        final FastaSequenceIndexEntry lastSequenceIndex = fastaSequenceIndexes.getIndexEntry(fastaSequenceIndexes.getLastSequence());
+
+        final long lastSequenceLength = lastSequenceIndex.getSize();
+        final long lastSequenceStart = lastSequenceIndex.getLocation();
+        final long lastSequenceEnd = lastSequenceStart + lastSequenceIndex.getOffset(lastSequenceLength);
+
+        final long fastaLength = Files.size(fastaFile);
+
+        if (lastSequenceEnd > fastaLength) {
+            throw new IllegalArgumentException("The fasta file (%s) is shorter (%d) than its index (%s) claims (%d). Please reindex the fasta.".formatted(fastaFile.toUri().toString(),fastaLength, index.toString(), lastSequenceEnd));
+        }
+        // if fasta file is longer than this, make sure that the remainder is just whitespaces
+        long posOfInterest = lastSequenceEnd + lastSequenceIndex.getTerminatorLength();
+        if (posOfInterest < fastaLength) {
+
+            final ByteBuffer channelBuffer = ByteBuffer.allocate(100);
+
+            while (posOfInterest < fastaLength) {
+                channelBuffer.clear();
+                readFromPosition(channelBuffer, posOfInterest);
+                for (int i = 0; i < channelBuffer.position(); i++) {
+                    byte b = channelBuffer.get(i);
+                    if (!Character.isWhitespace((char) b)) {
+                        throw new IllegalArgumentException(
+                                ("The fasta file %s is too long (relative to the index). In particular has a non-whitespace " +
+                                        "character (%c) at a position (%d) beyond the last base (%d), according to its index (%s)." +
+                                        " Please reindex the fasta.")
+                                        .formatted(fastaFile.toUri().toString(), (char) b, posOfInterest + i, lastSequenceEnd,index.toString()));
+                    }
+                }
+                posOfInterest += channelBuffer.limit();
+            }
+        }
+    }
+
+
     public FastaSequenceIndex getIndex() {
         return index;
     }
@@ -210,11 +260,12 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop
         final int bytesPerLine = indexEntry.getBytesPerLine();
         final int terminatorLength = bytesPerLine - basesPerLine;
 
-        long startOffset = ((start-1)/basesPerLine)*bytesPerLine + (start-1)%basesPerLine;
+        long startOffset = indexEntry.getOffset(start);
+
         // Cast to long so the second argument cannot overflow a signed integer.
         final long minBufferSize = Math.min((long) Defaults.NON_ZERO_BUFFER_SIZE, (long)(length / basesPerLine + 2) * (long)bytesPerLine);
         if (minBufferSize > Integer.MAX_VALUE) throw new SAMException("Buffer is too large: " +  minBufferSize);
-
+        
         // Allocate a buffer for reading in sequence data.
         final ByteBuffer channelBuffer = ByteBuffer.allocate((int)minBufferSize);
 

diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java
@@ -36,11 +36,7 @@
 import java.io.PrintStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.Objects;
-import java.util.Scanner;
+import java.util.*;
 import java.util.regex.MatchResult;
 
 /**
@@ -50,26 +46,31 @@ public class FastaSequenceIndex implements Iterable<FastaSequenceIndexEntry> {
     /**
      * Store the entries.  Use a LinkedHashMap for consistent iteration in insertion order.
      */
-    private final Map<String,FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<String,FastaSequenceIndexEntry>();
+    private final Map<String, FastaSequenceIndexEntry> sequenceEntries = new LinkedHashMap<>();
 
+    /** this member contains the name of the last index entry that was created during initialization. 
+     * Afterwards, subsequent operations (such as rename) might change the names and thus this member becomes unreliable.
+     */
+    private final String lastSequence;
+
     /**
      * Build a sequence index from the specified file.
      * @param indexFile File to open.
-     * @throws FileNotFoundException if the index file cannot be found.
+     * @throws SAMException if the index file cannot be found.
      */
-    public FastaSequenceIndex( File indexFile ) {
+    public FastaSequenceIndex( File indexFile ) throws SAMException {
         this(IOUtil.toPath(indexFile));
     }
 
     /**
      * Build a sequence index from the specified file.
      * @param indexFile File to open.
-     * @throws FileNotFoundException if the index file cannot be found.
+     * @throws SAMException if the index file cannot be found.
      */
-    public FastaSequenceIndex( Path indexFile ) {
+    public FastaSequenceIndex( Path indexFile ) throws SAMException { 
         IOUtil.assertFileIsReadable(indexFile);
         try (InputStream in = Files.newInputStream(indexFile)) {
-            parseIndexFile(in);
+            this.lastSequence = parseIndexFile(in);
         } catch (IOException e) {
             throw new SAMException("Fasta index file could not be opened: " + indexFile, e);
         }
@@ -80,13 +81,15 @@ public FastaSequenceIndex( Path indexFile ) {
      * @param in InputStream to read from.
      */
     public FastaSequenceIndex(InputStream in) {
-        parseIndexFile(in);
+        lastSequence=parseIndexFile(in);
     }
 
     /**
-     * Empty, protected constructor for unit testing.
+     * Empty, protected constructor for unit testing. Use with care, lastSequence will be incorrect.
      */
-    protected FastaSequenceIndex() {}
+    protected FastaSequenceIndex() {
+        lastSequence = "";
+    }
 
     /**
      * Add a new index entry to the list.  Protected for unit testing.
@@ -144,10 +147,13 @@ public int hashCode() {
     /**
      * Parse the contents of an index file, caching the results internally.
      * @param in InputStream to parse.
+     *           
+     * @return the name of the last contig (for the sanity-check)
      */
-    private void parseIndexFile(InputStream in) {
+    private String parseIndexFile(InputStream in) {
         try (Scanner scanner = new Scanner(in)) {
             int sequenceIndex = 0;
+            String lastContig = null;
             while( scanner.hasNext() ) {
                 // Tokenize and validate the index line.
                 String result = scanner.findInLine("(.+)\\t+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
@@ -170,7 +176,9 @@ private void parseIndexFile(InputStream in) {
                 contig = SAMSequenceRecord.truncateSequenceName(contig);
                 // Build sequence structure
                 add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) );
+                lastContig=contig;
             }
+            return lastContig;
         }
     }
 
@@ -233,4 +241,13 @@ public Iterator<FastaSequenceIndexEntry> iterator() {
     public int size() {
         return sequenceEntries.size();
     }
+
+
+    /**
+     * @return The name of the last entry that was added when parsing the index file. Only guarranteed to be correct just
+     * after initialization. Protected for access from AbstractIndexedFastaSequenceFile.
+     */
+    protected String getLastSequence() {
+        return this.lastSequence;
+    }
 }
diff --git a/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java b/src/main/java/htsjdk/samtools/reference/FastaSequenceIndexEntry.java
@@ -112,6 +112,26 @@ public int getSequenceIndex() {
         return sequenceIndex;
     }
 
+    /** Return the offset to pos as determined by the number of bases and bytes per line
+     *
+     * @param pos the (1-based) position in the contig that is requested
+     * @return the offset (0-based) from 'location' where pos is located in the file.
+     */
+    public long getOffset(long pos) {
+        final int basesPerLine = this.getBasesPerLine();
+        final int bytesPerLine = this.getBytesPerLine();
+
+        return ((pos - 1) / basesPerLine) * bytesPerLine + ((pos - 1) % basesPerLine);
+    }
+
+    /** get the terminator length from the bytes per line and the bases per line
+     *
+     * @return the length of the terminator of each line (1 or 2)
+     */
+    public int getTerminatorLength() {
+        return bytesPerLine - basesPerLine;
+    }
+
     /**
      * For debugging.  Emit the contents of each contig line.
      * @return A string representation of the contig line.

diff --git a/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java b/src/main/java/htsjdk/samtools/reference/IndexedFastaSequenceFile.java
@@ -81,6 +81,7 @@ public IndexedFastaSequenceFile(final Path path, final FastaSequenceIndex index)
                 throw new SAMException("Indexed block-compressed FASTA file cannot be handled: " + path);
             }
             this.channel = Files.newByteChannel(path);
+            sanityCheckFastaAgainstIndex(path,index);
         } catch (IOException e) {
             throw new SAMException("FASTA file should be readable but is not: " + path, e);
         }

diff --git a/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java b/src/test/java/htsjdk/samtools/reference/AbstractIndexedFastaSequenceFileTest.java
@@ -60,6 +60,11 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
     private static final File SEQUENCE_FILE_BGZ = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz");
     private static final File SEQUENCE_FILE_GZI = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.fasta.gz.gzi");
     private static final File SEQUENCE_FILE_NODICT = new File(TEST_DATA_DIR,"Homo_sapiens_assembly18.trimmed.nodict.fasta");
+    private static final Path HEADER_WITH_WHITESPACE = new File(TEST_DATA_DIR,"header_with_white_space.fasta").toPath();
+    private static final Path HEADER_WITH_EXTRA_WHITESPACE = new File(TEST_DATA_DIR,"header_with_extra_white_space.fasta").toPath();
+    private static final Path CRLF = new File(TEST_DATA_DIR,"crlf.fasta").toPath();
+    private static final Path HEADER_WITH_WHITESPACE_index = AbstractIndexedFastaSequenceFile.findFastaIndex(HEADER_WITH_WHITESPACE);
+    private static final Path CRLF_index = AbstractIndexedFastaSequenceFile.findFastaIndex(CRLF);
 
     private final String firstBasesOfChrM = "GATCACAGGTCTATCACCCT";
     private final String extendedBasesOfChrM = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT" +
@@ -68,6 +73,15 @@ public class AbstractIndexedFastaSequenceFileTest extends HtsjdkTest {
     private final String lastBasesOfChr20 = "ttgtctgatgctcatattgt";
     private final int CHR20_LENGTH = 1000000;
 
+    @DataProvider(name="mismatched_indexes")
+    public Object[][] provideMismatchedIndexes() {
+        return new Object[][]{
+                new Object[]{HEADER_WITH_WHITESPACE,CRLF_index},
+                new Object[]{CRLF,HEADER_WITH_WHITESPACE_index}
+        };
+    }
+
+
     @DataProvider(name="homosapiens")
     public Object[][] provideSequenceFile() throws FileNotFoundException {
         return new Object[][] { new Object[]
@@ -470,4 +484,24 @@ public void testBlockCompressedIndexedFastaSequenceFileFromNio() throws IOExcept
         }
     }
 
+    @Test(expectedExceptions = IllegalArgumentException.class,
+            dataProvider = "mismatched_indexes")
+    public void testWrongIndex(Path fasta, Path index) {
+        try (IndexedFastaSequenceFile indexedFastaSequenceFile = new IndexedFastaSequenceFile(fasta, new FastaSequenceIndex(index))) {
+
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }    
+
+    @Test
+    public void testExtraWhitespace() {
+        try (IndexedFastaSequenceFile indexedFastaSequenceFile = 
+                     new IndexedFastaSequenceFile(HEADER_WITH_EXTRA_WHITESPACE, 
+                             new FastaSequenceIndex(HEADER_WITH_WHITESPACE_index))) {
+
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
 }
diff --git a/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta b/src/test/resources/htsjdk/samtools/reference/header_with_extra_white_space.fasta
@@ -0,0 +1,7 @@
+>a test white space
+ACTG
+>b test whitespace
+ACTG
+
+
+