-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathBAMtoJunctionBED.py
executable file
·307 lines (277 loc) · 13.4 KB
/
BAMtoJunctionBED.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
###BAMtoJunctionBED
#Author Nathan Salomonis - [email protected]
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is furnished
#to do so, subject to the following conditions:
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
#INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
#PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
#HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
#OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""This script can be run on its own to extract a single BAM file at a time or
indirectly by multiBAMtoBED.py to extract junction.bed files (Tophat format)
from many BAM files in a single directory at once. Currently uses the Tophat
predicted Strand notation opt('XS') for each read. This can be substituted with
strand notations from other aligners (check with the software authors)."""
import pysam
import string,os,sys,copy,getopt
import time
import traceback
try: import export
except Exception: pass
try: import unique
except Exception: pass
try:
import TabProxies
import ctabix
import csamtools
import cvcf
except Exception:
try:
if os.name != 'posix': print traceback.format_exc()
except Exception: pass
def getSpliceSites(cigarList,X):
cummulative=0
coordinates=[]
for (code,seqlen) in cigarList:
if code == 0:
cummulative+=seqlen
if code == 3:
#if strand == '-':
five_prime_ss = str(X+cummulative)
cummulative+=seqlen ### add the intron length
three_prime_ss = str(X+cummulative+1) ### 3' exon start (prior exon splice-site + intron length)
coordinates.append([five_prime_ss,three_prime_ss])
up_to_intron_dist = cummulative
return coordinates, up_to_intron_dist
def writeJunctionBedFile(junction_db,jid,o):
strandStatus = True
for (chr,jc,tophat_strand) in junction_db:
if tophat_strand==None:
strandStatus = False
break
if strandStatus== False: ### If no strand information in the bam file filter and add known strand data
junction_db2={}
for (chr,jc,tophat_strand) in junction_db:
original_chr = chr
if 'chr' not in chr:
chr = 'chr'+chr
for j in jc:
try:
strand = splicesite_db[chr,j]
junction_db2[(original_chr,jc,strand)]=junction_db[(original_chr,jc,tophat_strand)]
except Exception: pass
junction_db = junction_db2
for (chr,jc,tophat_strand) in junction_db:
x_ls=[]; y_ls=[]; dist_ls=[]
read_count = str(len(junction_db[(chr,jc,tophat_strand)]))
for (X,Y,dist) in junction_db[(chr,jc,tophat_strand)]:
x_ls.append(X); y_ls.append(Y); dist_ls.append(dist)
outlier_start = min(x_ls); outlier_end = max(y_ls); dist = str(max(dist_ls))
exon_lengths = outlier_start
exon_lengths = str(int(jc[0])-outlier_start)+','+str(outlier_end-int(jc[1])+1)
junction_id = 'JUNC'+str(jid)+':'+jc[0]+'-'+jc[1] ### store the unique junction coordinates in the name
output_list = [chr,str(outlier_start),str(outlier_end),junction_id,read_count,tophat_strand,str(outlier_start),str(outlier_end),'255,0,0\t2',exon_lengths,'0,'+dist]
o.write(string.join(output_list,'\t')+'\n')
def writeIsoformFile(isoform_junctions,o):
for coord in isoform_junctions:
isoform_junctions[coord] = unique.unique(isoform_junctions[coord])
if '+' in coord:
print coord, isoform_junctions[coord]
if '+' in coord:
sys.exit()
def verifyFileLength(filename):
count = 0
try:
fn=filepath(filename)
for line in open(fn,'rU').xreadlines():
count+=1
if count>9: break
except Exception: null=[]
return count
def retreiveAllKnownSpliceSites():
### Uses a priori strand information when none present
import export, unique
chromosomes_found={}
parent_dir = export.findParentDir(bam_file)
species = None
for file in os.listdir(parent_dir):
if 'AltAnalyze_report' in file and '.log' in file:
log_file = unique.filepath(parent_dir+'/'+file)
log_contents = open(log_file, "rU")
species_tag = ' species: '
for line in log_contents:
line = line.rstrip()
if species_tag in line:
species = string.split(line,species_tag)[1]
if species == None:
species = IndicatedSpecies
splicesite_db={}
try:
exon_dir = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_exon.txt'
length = verifyFileLength(exon_dir)
except Exception:
length = 0
if length==0:
exon_dir = ExonReference
refExonCoordinateFile = unique.filepath(exon_dir)
firstLine=True
for line in open(refExonCoordinateFile,'rU').xreadlines():
if firstLine: firstLine=False
else:
line = line.rstrip('\n')
t = string.split(line,'\t'); #'gene', 'exon-id', 'chromosome', 'strand', 'exon-region-start(s)', 'exon-region-stop(s)', 'constitutive_call', 'ens_exon_ids', 'splice_events', 'splice_junctions'
geneID, exon, chr, strand, start, stop = t[:6]
#start = int(start); stop = int(stop)
#geneID = string.split(exon,':')[0]
splicesite_db[chr,start]=strand
splicesite_db[chr,stop]=strand
if len(chr)<5 or ('GL0' not in chr and 'GL' not in chr and 'JH' not in chr and 'MG' not in chr):
chromosomes_found[string.replace(chr,'chr','')] = []
return splicesite_db,chromosomes_found
def parseJunctionEntries(bam_dir,multi=False, Species=None, ReferenceDir=None):
global bam_file
global splicesite_db
global IndicatedSpecies
global ExonReference
IndicatedSpecies = Species
ExonReference = ReferenceDir
bam_file = bam_dir
try: splicesite_db,chromosomes_found = retreiveAllKnownSpliceSites()
except Exception:
#print traceback.format_exc()
splicesite_db={}; chromosomes_found={}
start = time.time()
try: import collections; junction_db=collections.OrderedDict()
except Exception:
try: import ordereddict; junction_db = ordereddict.OrderedDict()
except Exception: junction_db={}
original_junction_db = copy.deepcopy(junction_db)
bamf = pysam.Samfile(bam_dir, "rb" )
### Is there are indexed .bai for the BAM? Check.
try:
for entry in bamf.fetch():
codes = map(lambda x: x[0],entry.cigar)
break
except Exception:
### Make BAM Index
if multi == False:
print 'Building BAM index file for', bam_dir
bam_dir = str(bam_dir)
#On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
pysam.index(bam_dir)
bamf = pysam.Samfile(bam_dir, "rb" )
chromosome = False
chromosomes={}
count=0
jid = 1
prior_jc_start=0
l1 = None; l2=None
o = open (string.replace(bam_dir,'.bam','__junction.bed'),"w")
o.write('track name=junctions description="TopHat junctions"\n')
export_isoform_models = False
if export_isoform_models:
io = open (string.replace(bam_dir,'.bam','__isoforms.txt'),"w")
isoform_junctions = copy.deepcopy(junction_db)
outlier_start = 0; outlier_end = 0; read_count = 0; c=0
for entry in bamf.fetch():
try: cigarstring = entry.cigarstring
except Exception:
codes = map(lambda x: x[0],entry.cigar)
if 3 in codes: cigarstring = 'N'
else: cigarstring = None
if cigarstring != None:
if 'N' in cigarstring: ### Hence a junction
"""
if entry.cigar[0][1]<60 and entry.cigar[0][1]>20:
if count<310:
a1 = entry.seq[entry.cigar[0][1]-5:entry.cigar[0][1]]
a2 = entry.seq[entry.cigar[0][1]:entry.cigar[0][1]+6]
if l1==a1 and l2==a2: continue
else:
print entry.opt('XS'), a1,a2, entry.seq
l1 = a1; l2 = a2
else: sys.exit()
"""
if prior_jc_start == 0: pass
elif (entry.pos-prior_jc_start) > 5000 or bamf.getrname( entry.rname ) != chromosome: ### New chr or far from prior reads
writeJunctionBedFile(junction_db,jid,o)
#writeIsoformFile(isoform_junctions,io)
junction_db = copy.deepcopy(original_junction_db) ### Re-set this object
jid+=1
chromosome = bamf.getrname( entry.rname )
chromosomes[chromosome]=[] ### keep track
X=entry.pos
Y=entry.pos+entry.alen
prior_jc_start = X
"""
if entry.is_reverse:
strand = '-' ### This is the strand the seq aligns to but not necessarily the REAL strand the mRNA aligns to (see XS below)
else:
strand = '+' """
try: tophat_strand = entry.opt('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
except Exception:
#if multi == False: print 'No TopHat strand information';sys.exit()
tophat_strand = None
coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X)
for (five_prime_ss,three_prime_ss) in coordinates:
jc = five_prime_ss,three_prime_ss
#print X, Y, jc, entry.cigarstring, entry.cigar
try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist])
except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]]
if export_isoform_models:
try:
mate = bamf.mate(entry) #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI
if 'N' in mate.cigarstring:
mate_coordinates,mate_up_to_intron_dist = getSpliceSites(mate.cigar,mate.pos)
else: mate_coordinates=[]
except Exception: mate_coordinates=[]
#print coordinates,mate_coordinates
junctions = map(lambda x: tuple(x),coordinates)
if len(mate_coordinates)>0:
try:
isoform_junctions[chromosome,tuple(junctions),tophat_strand].append(mate_coordinates)
except Exception:
isoform_junctions[chromosome,tuple(junctions),tophat_strand] = [mate_coordinates]
else:
if (chromosome,tuple(junctions),tophat_strand) not in isoform_junctions:
isoform_junctions[chromosome,tuple(junctions),tophat_strand] = []
count+=1
writeJunctionBedFile(junction_db,jid,o) ### One last read-out
if multi == False:
print time.time()-start, 'seconds required to parse the BAM file'
o.close()
bamf.close()
missing_chromosomes=[]
for chr in chromosomes_found:
if chr not in chromosomes:
chr = string.replace(chr,'chr','')
if chr not in chromosomes_found:
if chr != 'M' and chr != 'MT':
missing_chromosomes.append(chr)
#missing_chromosomes = ['A','B','C','D']
try: bam_file = export.findFilename(bam_file)
except Exception: pass
return bam_file, missing_chromosomes
if __name__ == "__main__":
################ Comand-line arguments ################
if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments
print "Warning! Please designate a BAM file as input in the command-line"
print "Example: python BAMtoJunctionBED.py --i /Users/me/sample1.bam"
sys.exit()
else:
Species = None
options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','species=','r='])
for opt, arg in options:
if opt == '--i': bam_dir=arg ### full path of a BAM file
elif opt == '--species': Species=arg ### species for STAR analysis to get strand
elif opt == '--r': reference_dir=arg ### An exon.bed reference file (created by AltAnalyze from junctions, multiBAMtoBED.py or other) - required for STAR to get strand if XS field is empty
else:
print "Warning! Command-line argument: %s not recognized. Exiting..." % opt; sys.exit()
try: parseJunctionEntries(bam_dir,Species=Species,ReferenceDir=reference_dir)
except ZeroDivisionError:
print [sys.argv[1:]],'error'; error