Skip to content

Dev #31

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 19, 2021
2 changes: 1 addition & 1 deletion scorpio/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
_program = "scorpio"
__version__ = "0.3.12"
__version__ = "0.3.13"
10 changes: 9 additions & 1 deletion scorpio/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@ def main(sysargs = sys.argv[1:]):
"--append-genotypes", dest="append_genotypes", action="store_true",
help="Output a column per variant with the call"
)
subparser_haplotype.add_argument(
"--combination", dest="combination", action="store_true",
help="Combines the mutations for the specified constellations, and outputs a string across them all, with counts per found constellation"
)
subparser_haplotype.set_defaults(func=scorpio.subcommands.haplotype.run)

# _______________________________ report __________________________________#
Expand Down Expand Up @@ -154,6 +158,10 @@ def main(sysargs = sys.argv[1:]):
'--outgroups', dest='outgroups', required=False,
help='Two column CSV with group, and pipe separated list of outgroup sequence_names for that list. '
'Assumes outgroups will be in main input CSV')
subparser_define.add_argument(
"--protein", dest="protein", action="store_true",
help="Translates definition coordinates to proteins where possible"
)

subparser_define.set_defaults(func=scorpio.subcommands.define.run)

Expand Down Expand Up @@ -243,7 +251,7 @@ def main(sysargs = sys.argv[1:]):
if not args.reference_json:
args.reference_json = reference_json
logging.info("Found reference %s" %args.reference_json)
if not args.constellations:
if not args.constellations and args.command in ['haplotype', 'classify']:
args.constellations = list_constellation_files
logging.info("Found constellations:")
for c in args.constellations:
Expand Down
46 changes: 34 additions & 12 deletions scorpio/scripts/extract_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from Bio.Seq import Seq
from operator import itemgetter

from .type_constellations import load_feature_coordinates
from .type_constellations import load_feature_coordinates, resolve_ambiguous_cds

def parse_args():
parser = argparse.ArgumentParser(description="""Pick a representative sample for each unique sequence""",
Expand Down Expand Up @@ -90,6 +90,15 @@ def update_var_dict(var_dict, group, variants):
return


def update_feature_dict(feature_dict):
for feature in feature_dict:
if len(feature_dict[feature]) > 2:
cds, aa_pos = resolve_ambiguous_cds(feature_dict[feature][2], feature_dict[feature][0], feature_dict)
if aa_pos:
feature_dict[feature] = (aa_pos, feature_dict[feature][1] + feature_dict[feature][0] - aa_pos, cds)
return feature_dict


def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, threshold_intermediate=0.25):
sorted_tuples = sorted(var_dict.items(), key=operator.itemgetter(1))
var_dict = {k: v for k, v in sorted_tuples}
Expand All @@ -110,7 +119,7 @@ def get_common_mutations(var_dict, min_occurance=3, threshold_common=0.98, thres
return common, intermediate


def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq):
def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_seq, include_protein=False):
nuc_end = nuc_start + len(nuc_ref)
nuc_start = int(nuc_start)
nuc_end = int(nuc_end)
Expand Down Expand Up @@ -138,12 +147,26 @@ def translate_if_possible(nuc_start, nuc_ref, nuc_alt, feature_dict, reference_s
if ref_allele == query_allele:
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)
aa_pos = int((start - feature_dict[feature][0]) / 3) + 1
if include_protein:
feature, aa_pos = translate_to_protein_if_possible(feature, aa_pos, feature_dict)
#print(start, end, ref_allele, query_allele, aa_pos, feature)
return "%s:%s%i%s" % (feature, ref_allele, aa_pos, query_allele)
return "nuc:%s%i%s" % (nuc_ref, nuc_start, nuc_alt)


def define_mutations(list_variants, feature_dict, reference_seq):
def translate_to_protein_if_possible(cds, aa_start, feature_dict):
if not cds.startswith("orf"):
return cds, aa_start

for feature in feature_dict:
if len(feature_dict[feature]) < 3:
continue # only want nsp definitions
if feature_dict[feature][2] == cds:
if feature_dict[feature][0] <= aa_start <= feature_dict[feature][1]:
return feature, aa_start-feature_dict[feature][0]+1
return cds, aa_start

def define_mutations(list_variants, feature_dict, reference_seq, include_protein=False):
merged_list = []
if not list_variants:
return merged_list
Expand Down Expand Up @@ -184,7 +207,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
elif new[3]:
current[3] = new[3]
elif current[0] != "":
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
if freq:
merged_list.append("%s:%s" % (var, freq))
else:
Expand All @@ -193,7 +216,7 @@ def define_mutations(list_variants, feature_dict, reference_seq):
else:
current = new
if current[0] != "":
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq)
var = translate_if_possible(current[1], current[0], current[2], feature_dict, reference_seq, include_protein)
if freq:
merged_list.append("%s:%s" % (var, freq))
else:
Expand All @@ -214,17 +237,15 @@ def subtract_outgroup(common, outgroup_common):

def write_constellation(prefix, group, list_variants, list_intermediates, list_ancestral):
group_dict = {"name": group, "sites": list_variants, "intermediate": list_intermediates,
"rules": {"min_alt": int((len(list_variants) + 1) / 4), "max_ref": int((len(list_variants) - 1) / 4)}}
"rules": {"min_alt": max(len(list_variants) - 3, min(len(list_variants), 3)), "max_ref": 3}}
if list_ancestral:
group_dict["ancestral"] = list_ancestral
group_dict["rules"]["min_alt"] += int((len(list_ancestral)+1)/4)
group_dict["rules"]["max_ref"] += int((len(list_ancestral)-1)/4)
with open('%s/%s.json' % (prefix, group), 'w') as outfile:
json.dump(group_dict, outfile, indent=4)


def extract_definitions(in_variants, in_groups, group_column, index_column, reference_json, prefix, subset,
threshold_common, threshold_intermediate, outgroup_file):
threshold_common, threshold_intermediate, outgroup_file, include_protein):
if not in_groups:
in_groups = in_variants

Expand All @@ -239,6 +260,7 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
group_dict = get_group_dict(in_groups, group_column, index_column, groups_to_get)

reference_seq, feature_dict = load_feature_coordinates(reference_json)
feature_dict = update_feature_dict(feature_dict)

var_dict = {}
outgroup_var_dict = {}
Expand Down Expand Up @@ -283,9 +305,9 @@ def extract_definitions(in_variants, in_groups, group_column, index_column, refe
if group in outgroup_var_dict:
outgroup_common, outgroup_intermediate = get_common_mutations(outgroup_var_dict[group], min_occurance=1, threshold_common=threshold_common, threshold_intermediate=threshold_intermediate)
common, ancestral = subtract_outgroup(common, outgroup_common)
nice_common = define_mutations(common, feature_dict, reference_seq)
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq)
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq)
nice_common = define_mutations(common, feature_dict, reference_seq, include_protein)
nice_intermediate = define_mutations(intermediate, feature_dict, reference_seq, include_protein)
nice_ancestral = define_mutations(ancestral, feature_dict, reference_seq, include_protein)
write_constellation(prefix, group, nice_common, nice_intermediate, nice_ancestral)


Expand Down
Loading