Fixing projection issues.

holtgrewe · holtgrewe · commit 3dbfada6b385 · 2019-11-13T13:54:37.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,9 @@
 * Making faMT annotation for refSeq optional
 * Update RefSeq parser that does not run into null-pointer exceptions on mouse and rat genome (e.g. when no exon defines the gene name)
 * Fixing issue with block substitutions (#475).
+* Fixing RefSeq build to properly assign transcript model (use best match with alignment) in the case of duplicates.
+* Fixing issue in projection in the case of leading gaps (has no effect on CDS position prediction).
+* Adding `TranscriptModel.getTrimmedSequence()` that removes leading and trailing (unaligned in RefSeq) sequence.
 
 ### jannovar-htsjdk
 
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java
@@ -86,7 +86,7 @@ protected NucleotideChange getCDSNTChange() {
 		} catch (ProjectionException e) {
 			throw new Error("Bug: at this point, the position must be a transcript position");
 		}
-		if (DuplicationChecker.isDuplication(transcript.getSequence(), change.getAlt(), txPos.getPos())) {
+		if (DuplicationChecker.isDuplication(transcript.getTrimmedSequence(), change.getAlt(), txPos.getPos())) {
 			NucleotidePointLocationBuilder posBuilder = new NucleotidePointLocationBuilder(transcript);
 			if (change.getAlt().length() == 1) {
 				try {
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java
@@ -76,8 +76,8 @@ private Annotation buildCDSExonicAnnotation() {
 
 		// Check that the WT nucleotide from the transcript is consistent with change.ref and generate a warning message
 		// if this is not the case.
-		if (txPos.getPos() >= transcript.getSequence().length()
-			|| !transcript.getSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
+		if (txPos.getPos() >= transcript.getTrimmedSequence().length()
+			|| !transcript.getTrimmedSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
 			messages.add(AnnotationMessage.WARNING_REF_DOES_NOT_MATCH_TRANSCRIPT);
 
 		// Compute the frame shift and codon start position.
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java
@@ -119,7 +119,7 @@ public ImmutableList<TranscriptModel> run() throws TranscriptParseException {
 		// Load the FASTA file and assign to the builders.
 		final String pathFASTA = PathUtil.join(basePath, getINIFileName("rna"));
 		loadMitochondrialFASTA(builders);
-		
+
 		loadFASTA(builders, pathFASTA);
 
 		// Create final list of TranscriptModels.
@@ -158,13 +158,13 @@ private void loadMitochondrialFASTA(Map<String, TranscriptModelBuilder> builders
 			LOGGER.warn("Key for chrMT FASTA File does not exist, skipping.");
 			return;
 		}
-		
+
 		final String pathFasta = PathUtil.join(basePath, getINIFileName("faMT"));
 		if (!new File(pathFasta).exists()) {
 			LOGGER.warn("The chrMT FASTA File {} does not exist, skipping.", new Object[]{ pathFasta });
 			return;
 		}
-		
+
 
 		final int idMT = refDict.getContigNameToID().get("chrMT");
 
@@ -303,7 +303,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
 
 		Set<String> wantedTypes = Sets.newHashSet("exon", "CDS", "stop_codon");
 		boolean onlyCurated = onlyCurated();
-		
+
 		// Some exons do not have a gene name in mm9 and rat releases.
 		// Therefore we select collect the name from the gene.
 		Map<String, String> entrezMap = new HashMap<>();
@@ -327,13 +327,13 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
 				boolean isMT = contigDict.containsKey("chrMT") &&
 					contigDict.get("chrMT").equals(contigDict.get(record.getSeqID()));
 				if ("gene".equals(record.getType())) {
-					
+
 					String entrezId = parseGeneID(record);
 
 					String symbol = null;
 					symbol = record.getAttributes().get("gene");
 					entrezMap.put(entrezId, symbol);
-					
+
 					// Register chrMT Entrez IDs to the "MT..." gene name.
 					if (isMT) {
 						if (record.getAttributes().get("gene_synonym") == null) {
@@ -349,7 +349,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
 						mtEntrezMap.put(entrezId, symbol);
 					}
 				}
-				
+
 
 				// Handle the records describing the transcript structure.
 				//
@@ -380,12 +380,12 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
 							}
 
 							String entrezId = parseGeneID(record);
-							
+
 //							// skip on chrMT some tRNA that do not have any Dbxref. This happens in mm9 and rat v6
 //							if (record.getAttributes().get("gbkey") == "tRNA" && entrezId == null) {
 //								continue;
 //							}
-							
+
 							transcriptId = mtEntrezMap.get(entrezId);
 							builder.setAccession(transcriptId);
 						}
@@ -511,9 +511,9 @@ private TranscriptModelBuilder createNewTranscriptModelBuilder(FeatureRecord rec
 		builder.setTxVersion(record.getAttributes().get("transcript_version"));
 		String geneID = parseGeneID(record);
 		builder.setGeneID(geneID);
-		
+
 		String gene = record.getAttributes().get("gene");
-		
+
 		if (gene == null && geneID != null && notMtEntrezMap.containsKey(geneID)) {
 			gene = notMtEntrezMap.get(geneID);
 		}
@@ -646,14 +646,15 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
 				} else {
 					duplicatedTranscriptIdCount++;
 					// This is a tricky call - there are about 30 transcripts with duplicated transcriptIds due to imperfect
-					// alignment to the genomic reference
-					LOGGER.warn("Transcript {} has {} possible transcript models - using the longest model", transcriptId, parentIds
+					// alignment to the genomic reference.  We chose the one with the best match between alignment from cDNA_match
+					// and exons.
+					LOGGER.warn("Transcript {} has {} possible transcript models - picking best by cDNA_match", transcriptId, parentIds
 						.size());
 					List<TranscriptModelBuilder> possibleModels = parentIds.stream()
 						.map(transcriptModelsWithTxRegion::get)
 						.collect(toList());
-					TranscriptModelBuilder longestCds = findLongestTranscriptRegion(possibleModels);
-					transcriptIdToTranscriptModelBuilders.put(transcriptId, longestCds);
+					TranscriptModelBuilder bestModel = findBestModelByCdnaMatch(possibleModels);
+					transcriptIdToTranscriptModelBuilders.put(transcriptId, bestModel);
 				}
 			}
 		}
@@ -663,13 +664,45 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
 		return transcriptIdToTranscriptModelBuilders;
 	}
 
-	private TranscriptModelBuilder findLongestTranscriptRegion(List<TranscriptModelBuilder> possibleModels) {
-		TranscriptModelBuilder longestCds = possibleModels.get(0);
+	private static int countCdnaMatches(TranscriptModelBuilder model) {
+		int result = 0;
+		int i = 0;
+		for (AlignmentPart part : model.getAlignmentParts()) {
+			if (i >= model.getExonRegions().size()) {
+				continue;
+			}
+			final GenomeInterval exon = model.getExonRegions().get(i).withStrand(Strand.FWD);
+
+			final int partBegin;
+			final int partEnd;
+
+			if (part.refBeginPos < part.refEndPos) {
+				partBegin = part.refBeginPos;
+				partEnd = part.refEndPos;
+			} else {
+				partEnd = part.refBeginPos;
+				partBegin = part.refEndPos;
+			}
+
+			if (exon.getBeginPos() == partBegin && exon.getEndPos() == partEnd) {
+				result += 1;
+			}
+
+			i += 1;
+		}
+		return result;
+	}
+
+	private TranscriptModelBuilder findBestModelByCdnaMatch(List<TranscriptModelBuilder> possibleModels) {
+		TranscriptModelBuilder bestMatch = possibleModels.get(0);
+		int bestMatchCount = countCdnaMatches(bestMatch);
 		for (TranscriptModelBuilder current : possibleModels) {
-			if (current.getTXRegion().length() > longestCds.getTXRegion().length()) {
-				longestCds = current;
+			int currentMatchCount = countCdnaMatches(current);
+			if (currentMatchCount > bestMatchCount) {
+				bestMatch = current;
+				bestMatchCount = currentMatchCount;
 			}
 		}
-		return longestCds;
+		return bestMatch;
 	}
 }
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java
@@ -53,7 +53,7 @@ public static GenomeVariant normalizeInsertion(TranscriptModel transcript, Genom
 
 		// Insert the ALT bases at the position indicated by txPos.
 		int pos = txPos.getPos();
-		StringBuilder builder = new StringBuilder(transcript.getSequence());
+		StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
 		builder.insert(pos, change.getAlt());
 
 		// Execute algorithm and compute the shift.
@@ -106,7 +106,7 @@ public static GenomeVariant normalizeDeletion(TranscriptModel transcript, Genome
 		// Shift the deletion to the 3' (right) end of the transcript.
 		int pos = txPos.getPos();
 		final int LEN = change.getRef().length(); // length of the deletion
-		final String seq = transcript.getSequence();
+		final String seq = transcript.getTrimmedSequence();
 		int shift = 0;
 
 		while ((pos + LEN < seq.length()) && (seq.charAt(pos) == seq.charAt(pos + LEN))) {
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java
@@ -91,7 +91,7 @@ public TranscriptPosition getTranscriptEndPos() {
 	 */
 	@Override
 	public String toString() {
-		return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "-", endPos);
+		return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "_", endPos);
 	}
 
 	/*
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java
@@ -167,6 +167,15 @@ public String getSequence() {
 		return sequence;
 	}
 
+	/**
+	 * @return mDNA sequence of the spliced RNA of this known gene transcript with leading and
+	 * trailing sequences removed.
+	 */
+	public String getTrimmedSequence() {
+		final Alignment ali = seqAlignment;
+		return sequence.substring(ali.refLeadingGapLength(), sequence.length() - ali.refTrailingGapLength());
+	}
+
 	/**
 	 * @Return the sequence alignment to the exonic genome reference
 	 */
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java
@@ -47,7 +47,7 @@ public String getCDSTranscript() {
 		try {
 			TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
 			TranscriptPosition tEndPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeEndPos());
-			return transcript.getSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
+			return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
 		} catch (ProjectionException e) {
 			throw new Error("Bug: CDS begin/end must be translatable into transcript positions");
 		}
@@ -59,7 +59,7 @@ public String getCDSTranscript() {
 	public String getTranscriptStartingAtCDS() {
 		try {
 			TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
-			return transcript.getSequence().substring(tBeginPos.getPos(), transcript.getSequence().length());
+			return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), transcript.getTrimmedSequence().length());
 		} catch (ProjectionException e) {
 			throw new Error("Bug: CDS begin must be translatable into transcript positions");
 		}
@@ -88,7 +88,8 @@ public TranscriptPosition genomeToTranscriptPos(GenomePosition pos) throws Proje
 				int posInExon = pos.differenceTo(region.getGenomeBeginPos());
 				int transcriptPos = tOffset + posInExon;
 				// Project for position on genomic exons to position in sequence.
-				int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos);
+				int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos) -
+					transcript.getSeqAlignment().refLeadingGapLength();
 				return new TranscriptPosition(transcript, projectedTranscriptPos, PositionType.ZERO_BASED);
 			}
 			tOffset += region.length();
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java
@@ -50,7 +50,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
 		TranscriptSequenceOntologyDecorator soDecorator = new TranscriptSequenceOntologyDecorator(transcript);
 		if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval())
 			|| !soDecorator.overlapsWithExon(change.getGenomeInterval()))
-			return transcript.getSequence(); // non-coding change, does not affect transcript
+			return transcript.getTrimmedSequence(); // non-coding change, does not affect transcript
 
 		// Get transcript position for the change position.
 		TranscriptProjectionDecorator projector = new TranscriptProjectionDecorator(transcript);
@@ -62,7 +62,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
 		}
 
 		// Update base in string using StringBuilder.
-		StringBuilder builder = new StringBuilder(transcript.getSequence());
+		StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
 		if (change.getType() == GenomeVariantType.SNV)
 			builder.setCharAt(tPos.getPos(), change.getAlt().charAt(0));
 		else
@@ -73,7 +73,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
 	private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
 		// Short-circuit in the case of change that does not affect the transcript.
 		if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval()))
-			return transcript.getSequence();
+			return transcript.getTrimmedSequence();
 
 		// Get transcript begin and end position.
 		GenomePosition changeBeginPos = change.getGenomeInterval().getGenomeBeginPos();
@@ -94,7 +94,7 @@ private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
 		}
 
 		// Build resulting transcript string.
-		StringBuilder builder = new StringBuilder(transcript.getSequence());
+		StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
 		builder.delete(tBeginPos.getPos(), tEndPos.getPos());
 		builder.insert(tBeginPos.getPos(), change.getAlt());
 		return builder.toString();
diff --git a/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java b/jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java
@@ -80,10 +80,10 @@ public String getCodonAt(TranscriptPosition txPos, CDSPosition cdsPos) throws In
 		int frameShift = cdsPos.getPos() % 3;
 		int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
 		int endPos = codonStart + 3;
-		if (transcript.getSequence().length() < endPos)
+		if (transcript.getTrimmedSequence().length() < endPos)
 			throw new InvalidCodonException("Could not access codon " + codonStart + " - " + endPos
-				+ ", transcript sequence length is " + transcript.getSequence().length());
-		return transcript.getSequence().substring(codonStart, endPos);
+				+ ", transcript sequence length is " + transcript.getTrimmedSequence().length());
+		return transcript.getTrimmedSequence().substring(codonStart, endPos);
 	}
 
 	/**
@@ -102,9 +102,9 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
 		int frameShift = cdsPos.getPos() % 3;
 		int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
 		int endPos = codonStart + 3 * count;
-		if (endPos > transcript.getSequence().length())
-			endPos = transcript.getSequence().length();
-		return transcript.getSequence().substring(codonStart, endPos);
+		if (endPos > transcript.getTrimmedSequence().length())
+			endPos = transcript.getTrimmedSequence().length();
+		return transcript.getTrimmedSequence().substring(codonStart, endPos);
 	}
 
 	/**
@@ -116,7 +116,7 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
 	 * @return the codon affected by a change at the given position
 	 */
 	public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos) {
-		return getCodonsStartingFrom(txPos, cdsPos, transcript.getSequence().length());
+		return getCodonsStartingFrom(txPos, cdsPos, transcript.getTrimmedSequence().length());
 	}
 
 }
diff --git a/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java b/jannovar-core/src/test/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParserTest.java
@@ -414,16 +414,16 @@ private String printTranscriptModel(TranscriptModel transcriptModel) {
 		TranscriptProjectionDecorator tx0Decorator = new TranscriptProjectionDecorator(tx0);
 		TranscriptPosition tx0Pos0 = tx0Decorator
 			.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870425));
-		assertEquals(1, tx0Pos0.getPos());
+		assertEquals(0, tx0Pos0.getPos());
 		TranscriptPosition tx0Pos1 = tx0Decorator
 			.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870424));
-		assertEquals(2, tx0Pos1.getPos());
+		assertEquals(1, tx0Pos1.getPos());
 		TranscriptPosition tx0Pos2 = tx0Decorator
 			.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870423));
-		assertEquals(3, tx0Pos2.getPos());
+		assertEquals(2, tx0Pos2.getPos());
 		TranscriptPosition tx0Pos3 = tx0Decorator
 			.genomeToTranscriptPos(new GenomePosition(refDict, Strand.FWD, 1, 91870422));
-		assertEquals(4, tx0Pos3.getPos());
+		assertEquals(3, tx0Pos3.getPos());
 	}
 
 	/**

Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@ protected NucleotideChange getCDSNTChange() {`
`86`	`86`	`} catch (ProjectionException e) {`
`87`	`87`	`throw new Error("Bug: at this point, the position must be a transcript position");`
`88`	`88`	`}`
`89`		`- if (DuplicationChecker.isDuplication(transcript.getSequence(), change.getAlt(), txPos.getPos())) {`
	`89`	`+ if (DuplicationChecker.isDuplication(transcript.getTrimmedSequence(), change.getAlt(), txPos.getPos())) {`
`90`	`90`	`NucleotidePointLocationBuilder posBuilder = new NucleotidePointLocationBuilder(transcript);`
`91`	`91`	`if (change.getAlt().length() == 1) {`
`92`	`92`	`try {`
Original file line number	Diff line number	Diff line change
`@@ -91,7 +91,7 @@ public TranscriptPosition getTranscriptEndPos() {`
`91`	`91`	`*/`
`92`	`92`	`@Override`
`93`	`93`	`public String toString() {`
`94`		`- return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "-", endPos);`
	`94`	`+ return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "_", endPos);`
`95`	`95`	`}`
`96`	`96`
`97`	`97`	`/*`