diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f56d988..3742446d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#251](https://github.com/nf-core/funcscan/pull/251) Added annotation tool: Pyrodigal. (by @jasmezz) - [#252](https://github.com/nf-core/funcscan/pull/252) Added a new parameter `-arg_rgi_savejson` that saves the file `.json` in the RGI directory. The default ouput for RGI is now only `.txt`. (by @darcy220606) - [#253](https://github.com/nf-core/funcscan/pull/253) Updated Prodigal to have compressed output files. (by @jasmezz) +- [#262](https://github.com/nf-core/funcscan/pull/262) Added comBGC function to screen a whole directory of antiSMASH output (one subfolder per sample). (by @jasmezz) ### `Fixed` diff --git a/bin/comBGC.py b/bin/comBGC.py index ee703619..53cd64a6 100755 --- a/bin/comBGC.py +++ b/bin/comBGC.py @@ -32,7 +32,7 @@ SOFTWARE. """ -tool_version = "0.5" +tool_version = "0.6.0" welcome = """\ ........................ * comBGC v.{version} * @@ -61,7 +61,9 @@ these can be: - antiSMASH: .gbk and (optional) knownclusterblast/ directory - DeepBGC: .bgc.tsv -- GECCO: .clusters.tsv""", +- GECCO: .clusters.tsv +Note: Please provide files from a single sample only. If you would like to +summarize multiple samples, please see the --antismash_multiple_samples flag.""", ) parser.add_argument( "-o", @@ -73,6 +75,16 @@ type=str, default=".", ) +parser.add_argument( + "-a", + "--antismash_multiple_samples", + metavar="PATH", + dest="antismash_multiple_samples", + nargs="?", + help="""directory of antiSMASH output. Should contain subfolders (one per +sample). 
Can only be used if --input is not specified.""", + type=str, +) parser.add_argument("-vv", "--verbose", help="increase output verbosity", action="store_true") parser.add_argument("-v", "--version", help="show version number and exit", action="store_true") @@ -81,6 +93,7 @@ # Assign input arguments to variables input = args.input +dir_antismash = args.antismash_multiple_samples outdir = args.outdir verbose = args.verbose version = args.version @@ -111,8 +124,13 @@ elif path.endswith("knownclusterblast/"): input_antismash.append(path) +if input and dir_antismash: + exit( + "The flags --input and --antismash_multiple_samples are mutually exclusive.\nPlease use only one of them (or see --help for how to use)." + ) + # Make sure that at least one input argument is given -if not (input_antismash or input_gecco or input_deepbgc): +if not (input_antismash or input_gecco or input_deepbgc or dir_antismash): exit("Please specify at least one input file (i.e. output from antismash, deepbgc, or gecco) or see --help") ######################## @@ -120,6 +138,24 @@ ######################## +def prepare_multisample_input_antismash(antismash_dir): + """ + Collect the input paths of a given antiSMASH output folder (with sample subdirectories). Every folder containing an index.html counts as one sample, named after the folder. Returns a list with one entry per sample: [gbk_path, knownclusterblast_path], or [gbk_path] if the sample has no knownclusterblast folder. + """ + sample_paths = [] + for root, _subdirs, _files in os.walk(antismash_dir): + antismash_file = os.path.join(root, "index.html") + if os.path.exists(antismash_file): + sample = os.path.basename(os.path.normpath(root)) + gbk_path = os.path.join(root, sample) + ".gbk" + kcb_path = os.path.join(root, "knownclusterblast") + if os.path.exists(kcb_path): + sample_paths.append([gbk_path, kcb_path]) + else: + sample_paths.append([gbk_path]) + return sample_paths + + def parse_knownclusterblast(kcb_file_path): """ Extract MIBiG IDs from knownclusterblast TXT file. @@ -148,9 +184,6 @@ def antismash_workflow(antismash_paths): - Return data frame with aggregated info. """ - if verbose: - print("\nParsing antiSMASH files\n... 
", end="") - antismash_sum_cols = [ "Sample_ID", "Prediction_tool", @@ -186,6 +219,9 @@ def antismash_workflow(antismash_paths): # Aggregate information Sample_ID = gbk_path.split("/")[-1].split(".gbk")[-2] # Assuming file name equals sample name + if verbose: + print("\nParsing antiSMASH file(s): " + Sample_ID + "\n... ", end="") + with open(gbk_path) as gbk: for record in SeqIO.parse(gbk, "genbank"): # GBK records are contigs in this case # Initiate variables per contig @@ -514,7 +550,13 @@ def gecco_workflow(gecco_paths): ######################## if __name__ == "__main__": - tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco} + if input_antismash: + tools = {"antiSMASH": input_antismash, "deepBGC": input_deepbgc, "GECCO": input_gecco} + elif dir_antismash: + tools = {"antiSMASH": dir_antismash} + else: + tools = {"deepBGC": input_deepbgc, "GECCO": input_gecco} + tools_provided = {} for tool in tools.keys(): @@ -532,7 +574,13 @@ def gecco_workflow(gecco_paths): for tool in tools_provided.keys(): if tool == "antiSMASH": - summary_antismash = antismash_workflow(input_antismash) + if dir_antismash: + antismash_paths = prepare_multisample_input_antismash(dir_antismash) + for input_antismash in antismash_paths: + summary_antismash_temp = antismash_workflow(input_antismash) + summary_antismash = pd.concat([summary_antismash, summary_antismash_temp]) + else: + summary_antismash = antismash_workflow(input_antismash) elif tool == "deepBGC": summary_deepbgc = deepbgc_workflow(input_deepbgc) elif tool == "GECCO":