9
9
from Bio .Seq import Seq
10
10
from operator import itemgetter
11
11
12
- from .type_constellations import load_feature_coordinates
12
+ from .type_constellations import load_feature_coordinates , resolve_ambiguous_cds
13
13
14
14
def parse_args ():
15
15
parser = argparse .ArgumentParser (description = """Pick a representative sample for each unique sequence""" ,
@@ -90,6 +90,15 @@ def update_var_dict(var_dict, group, variants):
90
90
return
91
91
92
92
93
+ def update_feature_dict (feature_dict ):
94
+ for feature in feature_dict :
95
+ if len (feature_dict [feature ]) > 2 :
96
+ cds , aa_pos = resolve_ambiguous_cds (feature_dict [feature ][2 ], feature_dict [feature ][0 ], feature_dict )
97
+ if aa_pos :
98
+ feature_dict [feature ] = (aa_pos , feature_dict [feature ][1 ] + feature_dict [feature ][0 ] - aa_pos , cds )
99
+ return feature_dict
100
+
101
+
93
102
def get_common_mutations (var_dict , min_occurance = 3 , threshold_common = 0.98 , threshold_intermediate = 0.25 ):
94
103
sorted_tuples = sorted (var_dict .items (), key = operator .itemgetter (1 ))
95
104
var_dict = {k : v for k , v in sorted_tuples }
@@ -110,7 +119,7 @@ def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, thres
110
119
return common , intermediate
111
120
112
121
113
- def translate_if_possible (nuc_start , nuc_ref , nuc_alt , feature_dict , reference_seq ):
122
+ def translate_if_possible (nuc_start , nuc_ref , nuc_alt , feature_dict , reference_seq , include_protein = False ):
114
123
nuc_end = nuc_start + len (nuc_ref )
115
124
nuc_start = int (nuc_start )
116
125
nuc_end = int (nuc_end )
@@ -138,12 +147,26 @@ def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_s
138
147
if ref_allele == query_allele :
139
148
return "nuc:%s%i%s" % (nuc_ref , nuc_start , nuc_alt )
140
149
aa_pos = int ((start - feature_dict [feature ][0 ]) / 3 ) + 1
150
+ if include_protein :
151
+ feature , aa_pos = translate_to_protein_if_possible (feature , aa_pos , feature_dict )
141
152
#print(start, end, ref_allele, query_allele, aa_pos, feature)
142
153
return "%s:%s%i%s" % (feature , ref_allele , aa_pos , query_allele )
143
154
return "nuc:%s%i%s" % (nuc_ref , nuc_start , nuc_alt )
144
155
145
156
146
- def define_mutations (list_variants , feature_dict , reference_seq ):
157
+ def translate_to_protein_if_possible (cds , aa_start , feature_dict ):
158
+ if not cds .startswith ("orf" ):
159
+ return cds , aa_start
160
+
161
+ for feature in feature_dict :
162
+ if len (feature_dict [feature ]) < 3 :
163
+ continue # only want nsp definitions
164
+ if feature_dict [feature ][2 ] == cds :
165
+ if feature_dict [feature ][0 ] <= aa_start <= feature_dict [feature ][1 ]:
166
+ return feature , aa_start - feature_dict [feature ][0 ]+ 1
167
+ return cds , aa_start
168
+
169
+ def define_mutations (list_variants , feature_dict , reference_seq , include_protein = False ):
147
170
merged_list = []
148
171
if not list_variants :
149
172
return merged_list
@@ -184,7 +207,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
184
207
elif new [3 ]:
185
208
current [3 ] = new [3 ]
186
209
elif current [0 ] != "" :
187
- var = translate_if_possible (current [1 ], current [0 ], current [2 ], feature_dict , reference_seq )
210
+ var = translate_if_possible (current [1 ], current [0 ], current [2 ], feature_dict , reference_seq , include_protein )
188
211
if freq :
189
212
merged_list .append ("%s:%s" % (var , freq ))
190
213
else :
@@ -193,7 +216,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
193
216
else :
194
217
current = new
195
218
if current [0 ] != "" :
196
- var = translate_if_possible (current [1 ], current [0 ], current [2 ], feature_dict , reference_seq )
219
+ var = translate_if_possible (current [1 ], current [0 ], current [2 ], feature_dict , reference_seq , include_protein )
197
220
if freq :
198
221
merged_list .append ("%s:%s" % (var , freq ))
199
222
else :
@@ -214,17 +237,15 @@ def subtract_outgroup(common, outgroup_common):
214
237
215
238
def write_constellation (prefix , group , list_variants , list_intermediates , list_ancestral ):
216
239
group_dict = {"name" : group , "sites" : list_variants , "intermediate" : list_intermediates ,
217
- "rules" : {"min_alt" : int (( len (list_variants ) + 1 ) / 4 ), "max_ref" : int (( len (list_variants ) - 1 ) / 4 ) }}
240
+ "rules" : {"min_alt" : max ( len (list_variants ) - 3 , min ( len (list_variants ), 3 )), "max_ref" : 3 }}
218
241
if list_ancestral :
219
242
group_dict ["ancestral" ] = list_ancestral
220
- group_dict ["rules" ]["min_alt" ] += int ((len (list_ancestral )+ 1 )/ 4 )
221
- group_dict ["rules" ]["max_ref" ] += int ((len (list_ancestral )- 1 )/ 4 )
222
243
with open ('%s/%s.json' % (prefix , group ), 'w' ) as outfile :
223
244
json .dump (group_dict , outfile , indent = 4 )
224
245
225
246
226
247
def extract_definitions (in_variants , in_groups , group_column , index_column , reference_json , prefix , subset ,
227
- threshold_common , threshold_intermediate , outgroup_file ):
248
+ threshold_common , threshold_intermediate , outgroup_file , include_protein ):
228
249
if not in_groups :
229
250
in_groups = in_variants
230
251
@@ -239,6 +260,7 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
239
260
group_dict = get_group_dict (in_groups , group_column , index_column , groups_to_get )
240
261
241
262
reference_seq , feature_dict = load_feature_coordinates (reference_json )
263
+ feature_dict = update_feature_dict (feature_dict )
242
264
243
265
var_dict = {}
244
266
outgroup_var_dict = {}
@@ -283,9 +305,9 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
283
305
if group in outgroup_var_dict :
284
306
outgroup_common , outgroup_intermediate = get_common_mutations (outgroup_var_dict [group ], min_occurance = 1 , threshold_common = threshold_common , threshold_intermediate = threshold_intermediate )
285
307
common , ancestral = subtract_outgroup (common , outgroup_common )
286
- nice_common = define_mutations (common , feature_dict , reference_seq )
287
- nice_intermediate = define_mutations (intermediate , feature_dict , reference_seq )
288
- nice_ancestral = define_mutations (ancestral , feature_dict , reference_seq )
308
+ nice_common = define_mutations (common , feature_dict , reference_seq , include_protein )
309
+ nice_intermediate = define_mutations (intermediate , feature_dict , reference_seq , include_protein )
310
+ nice_ancestral = define_mutations (ancestral , feature_dict , reference_seq , include_protein )
289
311
write_constellation (prefix , group , nice_common , nice_intermediate , nice_ancestral )
290
312
291
313
0 commit comments