Skip to content

Commit 3dbfada

Browse files
committed
Fixing projection issues.
1 parent 53fff52 commit 3dbfada

File tree

11 files changed

+90
-44
lines changed

11 files changed

+90
-44
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
* Making faMT annotation for refSeq optional
1616
* Update RefSeq parser that does not run into null-pointer exceptions on mouse and rat genome (e.g. when no exon defines the gene name)
1717
* Fixing issue with block substitutions (#475).
18+
* Fixing RefSeq build to properly assign transcript model (use best match with alignment) in the case of duplicates.
19+
* Fixing issue in projection in the case of leading gaps (has no effect on CDS position prediction).
20+
* Adding `TranscriptModel.getTrimmedSequence()` that removes leading and trailing (unaligned in RefSeq) sequence.
1821

1922
### jannovar-htsjdk
2023

jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/InsertionAnnotationBuilder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ protected NucleotideChange getCDSNTChange() {
8686
} catch (ProjectionException e) {
8787
throw new Error("Bug: at this point, the position must be a transcript position");
8888
}
89-
if (DuplicationChecker.isDuplication(transcript.getSequence(), change.getAlt(), txPos.getPos())) {
89+
if (DuplicationChecker.isDuplication(transcript.getTrimmedSequence(), change.getAlt(), txPos.getPos())) {
9090
NucleotidePointLocationBuilder posBuilder = new NucleotidePointLocationBuilder(transcript);
9191
if (change.getAlt().length() == 1) {
9292
try {

jannovar-core/src/main/java/de/charite/compbio/jannovar/annotation/builders/SNVAnnotationBuilder.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,8 @@ private Annotation buildCDSExonicAnnotation() {
7676

7777
// Check that the WT nucleotide from the transcript is consistent with change.ref and generate a warning message
7878
// if this is not the case.
79-
if (txPos.getPos() >= transcript.getSequence().length()
80-
|| !transcript.getSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
79+
if (txPos.getPos() >= transcript.getTrimmedSequence().length()
80+
|| !transcript.getTrimmedSequence().substring(txPos.getPos(), txPos.getPos() + 1).equals(change.getRef()))
8181
messages.add(AnnotationMessage.WARNING_REF_DOES_NOT_MATCH_TRANSCRIPT);
8282

8383
// Compute the frame shift and codon start position.

jannovar-core/src/main/java/de/charite/compbio/jannovar/impl/parse/refseq/RefSeqParser.java

Lines changed: 53 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ public ImmutableList<TranscriptModel> run() throws TranscriptParseException {
119119
// Load the FASTA file and assign to the builders.
120120
final String pathFASTA = PathUtil.join(basePath, getINIFileName("rna"));
121121
loadMitochondrialFASTA(builders);
122-
122+
123123
loadFASTA(builders, pathFASTA);
124124

125125
// Create final list of TranscriptModels.
@@ -158,13 +158,13 @@ private void loadMitochondrialFASTA(Map<String, TranscriptModelBuilder> builders
158158
LOGGER.warn("Key for chrMT FASTA File does not exist, skipping.");
159159
return;
160160
}
161-
161+
162162
final String pathFasta = PathUtil.join(basePath, getINIFileName("faMT"));
163163
if (!new File(pathFasta).exists()) {
164164
LOGGER.warn("The chrMT FASTA File {} does not exist, skipping.", new Object[]{ pathFasta });
165165
return;
166166
}
167-
167+
168168

169169
final int idMT = refDict.getContigNameToID().get("chrMT");
170170

@@ -303,7 +303,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
303303

304304
Set<String> wantedTypes = Sets.newHashSet("exon", "CDS", "stop_codon");
305305
boolean onlyCurated = onlyCurated();
306-
306+
307307
// Some exons do not have a gene name in mm9 and rat releases.
308308
// Therefore we select collect the name from the gene.
309309
Map<String, String> entrezMap = new HashMap<>();
@@ -327,13 +327,13 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
327327
boolean isMT = contigDict.containsKey("chrMT") &&
328328
contigDict.get("chrMT").equals(contigDict.get(record.getSeqID()));
329329
if ("gene".equals(record.getType())) {
330-
330+
331331
String entrezId = parseGeneID(record);
332332

333333
String symbol = null;
334334
symbol = record.getAttributes().get("gene");
335335
entrezMap.put(entrezId, symbol);
336-
336+
337337
// Register chrMT Entrez IDs to the "MT..." gene name.
338338
if (isMT) {
339339
if (record.getAttributes().get("gene_synonym") == null) {
@@ -349,7 +349,7 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
349349
mtEntrezMap.put(entrezId, symbol);
350350
}
351351
}
352-
352+
353353

354354
// Handle the records describing the transcript structure.
355355
//
@@ -380,12 +380,12 @@ private Map<String, TranscriptModelBuilder> loadTranscriptModels(String pathGFF)
380380
}
381381

382382
String entrezId = parseGeneID(record);
383-
383+
384384
// // skip on chrMT some tRNA that do not have any Dbxref. This happens in mm9 and rat v6
385385
// if (record.getAttributes().get("gbkey") == "tRNA" && entrezId == null) {
386386
// continue;
387387
// }
388-
388+
389389
transcriptId = mtEntrezMap.get(entrezId);
390390
builder.setAccession(transcriptId);
391391
}
@@ -511,9 +511,9 @@ private TranscriptModelBuilder createNewTranscriptModelBuilder(FeatureRecord rec
511511
builder.setTxVersion(record.getAttributes().get("transcript_version"));
512512
String geneID = parseGeneID(record);
513513
builder.setGeneID(geneID);
514-
514+
515515
String gene = record.getAttributes().get("gene");
516-
516+
517517
if (gene == null && geneID != null && notMtEntrezMap.containsKey(geneID)) {
518518
gene = notMtEntrezMap.get(geneID);
519519
}
@@ -646,14 +646,15 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
646646
} else {
647647
duplicatedTranscriptIdCount++;
648648
// This is a tricky call - there are about 30 transcripts with duplicated transcriptIds due to imperfect
649-
// alignment to the genomic reference
650-
LOGGER.warn("Transcript {} has {} possible transcript models - using the longest model", transcriptId, parentIds
649+
// alignment to the genomic reference. We chose the one with the best match between alignment from cDNA_match
650+
// and exons.
651+
LOGGER.warn("Transcript {} has {} possible transcript models - picking best by cDNA_match", transcriptId, parentIds
651652
.size());
652653
List<TranscriptModelBuilder> possibleModels = parentIds.stream()
653654
.map(transcriptModelsWithTxRegion::get)
654655
.collect(toList());
655-
TranscriptModelBuilder longestCds = findLongestTranscriptRegion(possibleModels);
656-
transcriptIdToTranscriptModelBuilders.put(transcriptId, longestCds);
656+
TranscriptModelBuilder bestModel = findBestModelByCdnaMatch(possibleModels);
657+
transcriptIdToTranscriptModelBuilders.put(transcriptId, bestModel);
657658
}
658659
}
659660
}
@@ -663,13 +664,45 @@ private Map<String, TranscriptModelBuilder> mapNonRedundantTranscriptIdsToTransc
663664
return transcriptIdToTranscriptModelBuilders;
664665
}
665666

666-
private TranscriptModelBuilder findLongestTranscriptRegion(List<TranscriptModelBuilder> possibleModels) {
667-
TranscriptModelBuilder longestCds = possibleModels.get(0);
667+
private static int countCdnaMatches(TranscriptModelBuilder model) {
668+
int result = 0;
669+
int i = 0;
670+
for (AlignmentPart part : model.getAlignmentParts()) {
671+
if (i >= model.getExonRegions().size()) {
672+
continue;
673+
}
674+
final GenomeInterval exon = model.getExonRegions().get(i).withStrand(Strand.FWD);
675+
676+
final int partBegin;
677+
final int partEnd;
678+
679+
if (part.refBeginPos < part.refEndPos) {
680+
partBegin = part.refBeginPos;
681+
partEnd = part.refEndPos;
682+
} else {
683+
partEnd = part.refBeginPos;
684+
partBegin = part.refEndPos;
685+
}
686+
687+
if (exon.getBeginPos() == partBegin && exon.getEndPos() == partEnd) {
688+
result += 1;
689+
}
690+
691+
i += 1;
692+
}
693+
return result;
694+
}
695+
696+
private TranscriptModelBuilder findBestModelByCdnaMatch(List<TranscriptModelBuilder> possibleModels) {
697+
TranscriptModelBuilder bestMatch = possibleModels.get(0);
698+
int bestMatchCount = countCdnaMatches(bestMatch);
668699
for (TranscriptModelBuilder current : possibleModels) {
669-
if (current.getTXRegion().length() > longestCds.getTXRegion().length()) {
670-
longestCds = current;
700+
int currentMatchCount = countCdnaMatches(current);
701+
if (currentMatchCount > bestMatchCount) {
702+
bestMatch = current;
703+
bestMatchCount = currentMatchCount;
671704
}
672705
}
673-
return longestCds;
706+
return bestMatch;
674707
}
675708
}

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/GenomeVariantNormalizer.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ public static GenomeVariant normalizeInsertion(TranscriptModel transcript, Genom
5353

5454
// Insert the ALT bases at the position indicated by txPos.
5555
int pos = txPos.getPos();
56-
StringBuilder builder = new StringBuilder(transcript.getSequence());
56+
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
5757
builder.insert(pos, change.getAlt());
5858

5959
// Execute algorithm and compute the shift.
@@ -106,7 +106,7 @@ public static GenomeVariant normalizeDeletion(TranscriptModel transcript, Genome
106106
// Shift the deletion to the 3' (right) end of the transcript.
107107
int pos = txPos.getPos();
108108
final int LEN = change.getRef().length(); // length of the deletion
109-
final String seq = transcript.getSequence();
109+
final String seq = transcript.getTrimmedSequence();
110110
int shift = 0;
111111

112112
while ((pos + LEN < seq.length()) && (seq.charAt(pos) == seq.charAt(pos + LEN))) {

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptInterval.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ public TranscriptPosition getTranscriptEndPos() {
9191
*/
9292
@Override
9393
public String toString() {
94-
return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "-", endPos);
94+
return StringUtil.concatenate(transcript.getAccession(), ":n.", beginPos + 1, "_", endPos);
9595
}
9696

9797
/*

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptModel.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,15 @@ public String getSequence() {
167167
return sequence;
168168
}
169169

170+
/**
171+
* @return mDNA sequence of the spliced RNA of this known gene transcript with leading and
172+
* trailing sequences removed.
173+
*/
174+
public String getTrimmedSequence() {
175+
final Alignment ali = seqAlignment;
176+
return sequence.substring(ali.refLeadingGapLength(), sequence.length() - ali.refTrailingGapLength());
177+
}
178+
170179
/**
171180
* @Return the sequence alignment to the exonic genome reference
172181
*/

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptProjectionDecorator.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ public String getCDSTranscript() {
4747
try {
4848
TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
4949
TranscriptPosition tEndPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeEndPos());
50-
return transcript.getSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
50+
return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), tEndPos.getPos());
5151
} catch (ProjectionException e) {
5252
throw new Error("Bug: CDS begin/end must be translatable into transcript positions");
5353
}
@@ -59,7 +59,7 @@ public String getCDSTranscript() {
5959
public String getTranscriptStartingAtCDS() {
6060
try {
6161
TranscriptPosition tBeginPos = genomeToTranscriptPos(transcript.getCDSRegion().getGenomeBeginPos());
62-
return transcript.getSequence().substring(tBeginPos.getPos(), transcript.getSequence().length());
62+
return transcript.getTrimmedSequence().substring(tBeginPos.getPos(), transcript.getTrimmedSequence().length());
6363
} catch (ProjectionException e) {
6464
throw new Error("Bug: CDS begin must be translatable into transcript positions");
6565
}
@@ -88,7 +88,8 @@ public TranscriptPosition genomeToTranscriptPos(GenomePosition pos) throws Proje
8888
int posInExon = pos.differenceTo(region.getGenomeBeginPos());
8989
int transcriptPos = tOffset + posInExon;
9090
// Project for position on genomic exons to position in sequence.
91-
int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos);
91+
int projectedTranscriptPos = transcript.getSeqAlignment().projectRefToQry(transcriptPos) -
92+
transcript.getSeqAlignment().refLeadingGapLength();
9293
return new TranscriptPosition(transcript, projectedTranscriptPos, PositionType.ZERO_BASED);
9394
}
9495
tOffset += region.length();

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceChangeHelper.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
5050
TranscriptSequenceOntologyDecorator soDecorator = new TranscriptSequenceOntologyDecorator(transcript);
5151
if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval())
5252
|| !soDecorator.overlapsWithExon(change.getGenomeInterval()))
53-
return transcript.getSequence(); // non-coding change, does not affect transcript
53+
return transcript.getTrimmedSequence(); // non-coding change, does not affect transcript
5454

5555
// Get transcript position for the change position.
5656
TranscriptProjectionDecorator projector = new TranscriptProjectionDecorator(transcript);
@@ -62,7 +62,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
6262
}
6363

6464
// Update base in string using StringBuilder.
65-
StringBuilder builder = new StringBuilder(transcript.getSequence());
65+
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
6666
if (change.getType() == GenomeVariantType.SNV)
6767
builder.setCharAt(tPos.getPos(), change.getAlt().charAt(0));
6868
else
@@ -73,7 +73,7 @@ private String getTranscriptWithPointInRefAffected(GenomeVariant change) {
7373
private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
7474
// Short-circuit in the case of change that does not affect the transcript.
7575
if (!transcript.getTXRegion().overlapsWith(change.getGenomeInterval()))
76-
return transcript.getSequence();
76+
return transcript.getTrimmedSequence();
7777

7878
// Get transcript begin and end position.
7979
GenomePosition changeBeginPos = change.getGenomeInterval().getGenomeBeginPos();
@@ -94,7 +94,7 @@ private String getTranscriptWithRangeInRefAffected(GenomeVariant change) {
9494
}
9595

9696
// Build resulting transcript string.
97-
StringBuilder builder = new StringBuilder(transcript.getSequence());
97+
StringBuilder builder = new StringBuilder(transcript.getTrimmedSequence());
9898
builder.delete(tBeginPos.getPos(), tEndPos.getPos());
9999
builder.insert(tBeginPos.getPos(), change.getAlt());
100100
return builder.toString();

jannovar-core/src/main/java/de/charite/compbio/jannovar/reference/TranscriptSequenceDecorator.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,10 @@ public String getCodonAt(TranscriptPosition txPos, CDSPosition cdsPos) throws In
8080
int frameShift = cdsPos.getPos() % 3;
8181
int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
8282
int endPos = codonStart + 3;
83-
if (transcript.getSequence().length() < endPos)
83+
if (transcript.getTrimmedSequence().length() < endPos)
8484
throw new InvalidCodonException("Could not access codon " + codonStart + " - " + endPos
85-
+ ", transcript sequence length is " + transcript.getSequence().length());
86-
return transcript.getSequence().substring(codonStart, endPos);
85+
+ ", transcript sequence length is " + transcript.getTrimmedSequence().length());
86+
return transcript.getTrimmedSequence().substring(codonStart, endPos);
8787
}
8888

8989
/**
@@ -102,9 +102,9 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
102102
int frameShift = cdsPos.getPos() % 3;
103103
int codonStart = txPos.getPos() - frameShift; // codon start in transcript string
104104
int endPos = codonStart + 3 * count;
105-
if (endPos > transcript.getSequence().length())
106-
endPos = transcript.getSequence().length();
107-
return transcript.getSequence().substring(codonStart, endPos);
105+
if (endPos > transcript.getTrimmedSequence().length())
106+
endPos = transcript.getTrimmedSequence().length();
107+
return transcript.getTrimmedSequence().substring(codonStart, endPos);
108108
}
109109

110110
/**
@@ -116,7 +116,7 @@ public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos
116116
* @return the codon affected by a change at the given position
117117
*/
118118
public String getCodonsStartingFrom(TranscriptPosition txPos, CDSPosition cdsPos) {
119-
return getCodonsStartingFrom(txPos, cdsPos, transcript.getSequence().length());
119+
return getCodonsStartingFrom(txPos, cdsPos, transcript.getTrimmedSequence().length());
120120
}
121121

122122
}

0 commit comments

Comments
 (0)