Skip to content

Commit e12eb0e

Browse files
authored
Merge pull request #31 from cov-lineages/dev
Dev
2 parents 1278e79 + f4d1fa7 commit e12eb0e

File tree

11 files changed

+473
-177
lines changed

11 files changed

+473
-177
lines changed

scorpio/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
_program = "scorpio"
2-
__version__ = "0.3.12"
2+
__version__ = "0.3.13"

scorpio/__main__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ def main(sysargs = sys.argv[1:]):
109109
"--append-genotypes", dest="append_genotypes", action="store_true",
110110
help="Output a column per variant with the call"
111111
)
112+
subparser_haplotype.add_argument(
113+
"--combination", dest="combination", action="store_true",
114+
help="Combines the mutations for the specified constellations, and outputs a string across them all, with counts per found constellation"
115+
)
112116
subparser_haplotype.set_defaults(func=scorpio.subcommands.haplotype.run)
113117

114118
# _______________________________ report __________________________________#
@@ -154,6 +158,10 @@ def main(sysargs = sys.argv[1:]):
154158
'--outgroups', dest='outgroups', required=False,
155159
help='Two column CSV with group, and pipe separated list of outgroup sequence_names for that list. '
156160
'Assumes outgroups will be in main input CSV')
161+
subparser_define.add_argument(
162+
"--protein", dest="protein", action="store_true",
163+
help="Translates definition coordinates to proteins where possible"
164+
)
157165

158166
subparser_define.set_defaults(func=scorpio.subcommands.define.run)
159167

@@ -243,7 +251,7 @@ def main(sysargs = sys.argv[1:]):
243251
if not args.reference_json:
244252
args.reference_json = reference_json
245253
logging.info("Found reference %s" %args.reference_json)
246-
if not args.constellations:
254+
if not args.constellations and args.command in ['haplotype', 'classify']:
247255
args.constellations = list_constellation_files
248256
logging.info("Found constellations:")
249257
for c in args.constellations:

scorpio/scripts/extract_definitions.py

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from Bio.Seq import Seq
1010
from operator import itemgetter
1111

12-
from .type_constellations import load_feature_coordinates
12+
from .type_constellations import load_feature_coordinates, resolve_ambiguous_cds
1313

1414
def parse_args():
1515
parser = argparse.ArgumentParser(description="""Pick a representative sample for each unique sequence""",
@@ -90,6 +90,15 @@ def update_var_dict(var_dict, group, variants):
9090
return
9191

9292

93+
def update_feature_dict(feature_dict):
    """Re-anchor three-element feature entries via ``resolve_ambiguous_cds``.

    Entries with more than two elements are read as ``(start, end, cds)``.
    For each one, ``resolve_ambiguous_cds`` is called with the entry's cds
    name and start position; when it returns a truthy position, the entry is
    replaced by ``(aa_pos, end + start - aa_pos, resolved_cds)``. Entries with
    two or fewer elements are left untouched.

    The dictionary is modified in place and also returned.
    """
    for name, entry in feature_dict.items():
        if len(entry) <= 2:
            # Plain (start, end) entries carry no parent cds to resolve.
            continue
        start, end, parent_cds = entry[0], entry[1], entry[2]
        resolved_cds, aa_pos = resolve_ambiguous_cds(parent_cds, start, feature_dict)
        if not aa_pos:
            continue
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        # NOTE(review): the new end is end + start - aa_pos, i.e. it moves in
        # the opposite direction to the start -- confirm this is intended.
        feature_dict[name] = (aa_pos, end + start - aa_pos, resolved_cds)
    return feature_dict
100+
101+
93102
def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, threshold_intermediate=0.25):
94103
sorted_tuples = sorted(var_dict.items(), key=operator.itemgetter(1))
95104
var_dict = {k: v for k, v in sorted_tuples}
@@ -110,7 +119,7 @@ def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, thres
110119
return common, intermediate
111120

112121

113-
def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq):
122+
def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq, include_protein=False):
114123
nuc_end = nuc_start + len(nuc_ref)
115124
nuc_start = int(nuc_start)
116125
nuc_end = int(nuc_end)
@@ -138,12 +147,26 @@ def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_s
138147
if ref_allele == query_allele:
139148
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)
140149
aa_pos = int((start - feature_dict[feature][0]) / 3) + 1
150+
if include_protein:
151+
feature, aa_pos = translate_to_protein_if_possible(feature, aa_pos, feature_dict)
141152
#print(start, end, ref_allele, query_allele, aa_pos, feature)
142153
return "%s:%s%i%s" % (feature, ref_allele, aa_pos, query_allele)
143154
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)
144155

145156

146-
def define_mutations(list_variants, feature_dict, reference_seq):
157+
def translate_to_protein_if_possible(cds, aa_start, feature_dict):
    """Map a position in an ``orf*`` cds onto a protein defined within it.

    Only cds names starting with ``"orf"`` are considered. Among the entries
    of ``feature_dict`` that have at least three elements, read as
    ``(start, end, parent_cds)``, the first one whose parent equals ``cds``
    and whose inclusive ``start..end`` range contains ``aa_start`` wins.

    Returns a ``(name, position)`` tuple: the matching feature name with the
    position made 1-based relative to that feature's start, or the inputs
    ``(cds, aa_start)`` unchanged when nothing matches.
    """
    if cds.startswith("orf"):
        for name, coords in feature_dict.items():
            # Two-element entries have no parent cds, so they cannot match.
            if len(coords) >= 3 and coords[2] == cds and coords[0] <= aa_start <= coords[1]:
                return name, aa_start - coords[0] + 1
    return cds, aa_start
168+
169+
def define_mutations(list_variants, feature_dict, reference_seq, include_protein=False):
147170
merged_list = []
148171
if not list_variants:
149172
return merged_list
@@ -184,7 +207,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
184207
elif new[3]:
185208
current[3] = new[3]
186209
elif current[0] != "":
187-
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
210+
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
188211
if freq:
189212
merged_list.append("%s:%s" % (var, freq))
190213
else:
@@ -193,7 +216,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
193216
else:
194217
current = new
195218
if current[0] != "":
196-
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
219+
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
197220
if freq:
198221
merged_list.append("%s:%s" % (var, freq))
199222
else:
@@ -214,17 +237,15 @@ def subtract_outgroup(common, outgroup_common):
214237

215238
def write_constellation(prefix, group, list_variants, list_intermediates, list_ancestral):
216239
group_dict = {"name": group, "sites": list_variants, "intermediate": list_intermediates,
217-
"rules": {"min_alt": int((len(list_variants) + 1) / 4), "max_ref": int((len(list_variants) - 1) / 4)}}
240+
"rules": {"min_alt": max(len(list_variants) - 3, min(len(list_variants), 3)), "max_ref": 3}}
218241
if list_ancestral:
219242
group_dict["ancestral"] = list_ancestral
220-
group_dict["rules"]["min_alt"] += int((len(list_ancestral)+1)/4)
221-
group_dict["rules"]["max_ref"] += int((len(list_ancestral)-1)/4)
222243
with open('%s/%s.json' % (prefix, group), 'w') as outfile:
223244
json.dump(group_dict, outfile, indent=4)
224245

225246

226247
def extract_definitions(in_variants, in_groups, group_column, index_column, reference_json, prefix, subset,
227-
threshold_common, threshold_intermediate, outgroup_file):
248+
threshold_common, threshold_intermediate, outgroup_file, include_protein):
228249
if not in_groups:
229250
in_groups = in_variants
230251

@@ -239,6 +260,7 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
239260
group_dict = get_group_dict(in_groups, group_column, index_column, groups_to_get)
240261

241262
reference_seq, feature_dict = load_feature_coordinates(reference_json)
263+
feature_dict = update_feature_dict(feature_dict)
242264

243265
var_dict = {}
244266
outgroup_var_dict = {}
@@ -283,9 +305,9 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
283305
if group in outgroup_var_dict:
284306
outgroup_common, outgroup_intermediate = get_common_mutations(outgroup_var_dict[group], min_occurance=1, threshold_common=threshold_common, threshold_intermediate=threshold_intermediate)
285307
common, ancestral = subtract_outgroup(common, outgroup_common)
286-
nice_common = define_mutations(common, feature_dict, reference_seq)
287-
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq)
288-
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq)
308+
nice_common = define_mutations(common, feature_dict, reference_seq, include_protein)
309+
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq, include_protein)
310+
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq, include_protein)
289311
write_constellation(prefix, group, nice_common, nice_intermediate, nice_ancestral)
290312

291313

0 commit comments

Comments
 (0)