diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5b8befd9..54522ec3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ concurrency: jobs: test: - name: Run pipeline with test data (AMP and ARG workflows) + name: Run pipeline with test data (AMP/ARG) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -27,9 +27,9 @@ jobs: - "23.04.0" - "latest-everything" parameters: - - "--annotation_tool prodigal" - - "--annotation_tool prokka" - - "--annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions + - "-profile docker,test_preannotated --annotation_tool prodigal" + - "-profile docker,test --annotation_tool prokka" + - "-profile docker,test --annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions steps: - name: Check out pipeline code @@ -45,10 +45,10 @@ jobs: - name: Run pipeline with test data (AMP and ARG workflows) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }} + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results test_bgc: - name: Run pipeline with test data (BGC workflow) + name: Run pipeline with test data (BGC) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -58,9 +58,9 @@ jobs: - "23.04.0" - "latest-everything" parameters: - - "--annotation_tool prodigal" - - "--annotation_tool prokka" - - "--annotation_tool bakta --annotation_bakta_db_downloadtype light" + - "-profile docker,test_preannotated_bgc --annotation_tool prodigal" + - "-profile docker,test_bgc --annotation_tool prokka" + - "-profile docker,test_bgc --annotation_tool bakta --annotation_bakta_db_downloadtype light" steps: - name: Check out pipeline code @@ -76,10 +76,10 @@ jobs: - name: Run pipeline with test data (BGC workflow) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results --bgc_skip_deepbgc test_taxonomy: - name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows) + name: Run pipeline with test data (AMP, ARG and BGC taxonomy) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -89,9 +89,9 @@ jobs: - "23.04.0" - "latest-everything" parameters: - - "--annotation_tool prodigal" - - "--annotation_tool prokka" - - "--annotation_tool bakta --annotation_bakta_db_downloadtype light" + - "-profile docker,test_taxonomy --annotation_tool prodigal" # TODO: Add test_taxonomy_preannotated.config + - "-profile docker,test_taxonomy --annotation_tool prokka" + - "-profile docker,test_taxonomy --annotation_tool bakta --annotation_bakta_db_downloadtype light" steps: - name: Check out pipeline code @@ -107,4 +107,4 @@ jobs: - name: Run pipeline with test data (AMP, ARG and BGC taxonomy 
workflows) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }} + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ef451bf..601e35ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606) - [#358](https://github.com/nf-core/funcscan/pull/358) Improved RGI databases handling, users can supply their own CARD now. (by @jasmezz) - [#375](https://github.com/nf-core/funcscan/pull/375) Merged pipeline template of nf-core/tools version 2.14.1 (by @jfy133) +- [#340](https://github.com/nf-core/funcscan/pull/340) Added support for supplying pre-annotated sequences to the pipeline. (by @jfy133, @jasmezz) ### `Fixed` diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index efe5277c..45ddd48c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -10,6 +10,14 @@ report_section_order: "nf-core-funcscan-summary": order: -1002 +run_modules: + - prokka + - custom_content + +table_columns_visible: + Prokka: + organism: False + export_plots: true disable_version_detection: true diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 22583f22..791912cd 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,4 @@ -sample,fasta -sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz -sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz +sample,fasta,protein,gbk +sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk +sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz +sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index 757969c2..25efc523 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -18,9 +18,27 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$", - "errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fasta', '.fas', '.fa' or '.fna' (any of these can be optionally compressed as '.gz')", + "pattern": "^\\S+\\.(fasta|fas|fna|fa)(\\.gz)?$", + "errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fa.gz', '.fna.gz' or '.fasta.gz'", "unique": true + }, + "protein": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(faa)(\\.gz)?$", + "errorMessage": "Input file for peptide annotations has incorrect file format. 
File must end in .faa or .faa.gz", + "unique": true, + "dependentRequired": ["gbk"] + }, + "gbk": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(gbk|gbff)(\\.gz)?$", + "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gbff", + "unique": true, + "dependentRequired": ["protein"] + } } }, "required": ["sample", "fasta"] diff --git a/conf/base.config b/conf/base.config index c3d2523f..32c67616 100644 --- a/conf/base.config +++ b/conf/base.config @@ -79,11 +79,6 @@ process { time = { check_max( 8.h * task.attempt, 'time' ) } } - withName: PRODIGAL_GFF { - memory = { check_max( 2.GB * task.attempt, 'memory' ) } - cpus = 1 - } - withName: PRODIGAL_GBK { memory = { check_max( 2.GB * task.attempt, 'memory' ) } cpus = 1 diff --git a/conf/modules.config b/conf/modules.config index 2f63d961..b1904daf 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -110,6 +110,7 @@ process { } withName: PROKKA { + ext.prefix = { "${meta.id}_prokka" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/prokka/" }, mode: params.publish_dir_mode, @@ -128,7 +129,7 @@ process { params.annotation_prokka_rawproduct ? '--rawproduct' : '', params.annotation_prokka_rnammer ? '--rnammer' : '', params.annotation_prokka_compliant ? '--compliant' : '', - params.annotation_prokka_addgenes ? '--addgenes' : '' + params.annotation_prokka_addgenes ? '--addgenes' : '', ].join(' ').trim() } @@ -145,6 +146,7 @@ process { } withName: BAKTA_BAKTA { + ext.prefix = { "${meta.id}_bakta" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/bakta/${meta.id}" }, mode: params.publish_dir_mode, @@ -174,28 +176,13 @@ process { ].join(' ').trim() } - withName: PRODIGAL_GFF { + withName: PRODIGAL { + ext.prefix = { "${meta.id}_prodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/prodigal/${meta.id}" }, mode: params.publish_dir_mode, enabled: params.save_annotations, - pattern: "*.{faa,fna,gff}.gz", - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] - ext.args = [ - params.annotation_prodigal_singlemode ? "-p single" : "-p meta", - params.annotation_prodigal_closed ? "-c" : "", - params.annotation_prodigal_forcenonsd ? "-n" : "", - "-g ${params.annotation_prodigal_transtable}" - ].join(' ').trim() - } - - withName: PRODIGAL_GBK { - publishDir = [ - path: { "${params.outdir}/annotation/prodigal/${meta.id}" }, - mode: params.publish_dir_mode, - enabled: params.save_annotations, - pattern: "*.gbk.gz", + pattern: "*.{faa,fna,gbk,faa.gz,fna.gz,gbk.gz}", saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = [ @@ -207,11 +194,12 @@ process { } withName: PYRODIGAL { + ext.prefix = { "${meta.id}_pyrodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/pyrodigal/${meta.id}" }, mode: params.publish_dir_mode, enabled: params.save_annotations, - pattern: "*.{faa,fna,gff,score}.gz", + pattern: "*.{faa,fna,gbk,score}.gz", saveAs: { filename -> filename.equals('versions.yml') ?
null : filename } ] ext.args = [ @@ -287,6 +275,7 @@ process { } withName: FARGENE { + tag = {"${meta.id}|${hmm_model}"} publishDir = [ [ path: { "${params.outdir}/arg/fargene/${meta.id}" }, diff --git a/conf/test.config b/conf/test.config index 887f3528..f1345093 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,7 @@ params { input = params.pipelines_testdata_base_path + 'funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = params.pipelines_testdata_base_path + 'funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = true arg_fargene_hmmmodel = 'class_a,class_b_1_2' diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 89228579..c5e816ee 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -23,9 +23,13 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = false run_amp_screening = false run_bgc_screening = true + + // Set scores so deepBGC can actually find a hit so comBGC is actually executed + bgc_deepbgc_score = 0.1 + } diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 34fdd49a..9a3118a2 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -4,10 +4,8 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. - Although in this case we turn everything off - Use as follows: - nextflow run nf-core/funcscan -profile test, --outdir + nextflow run nf-core/funcscan -profile test_nothing, --outdir ---------------------------------------------------------------------------------------- */ @@ -24,10 +22,31 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = false run_amp_screening = false run_bgc_screening = false + + arg_fargene_hmmmodel = 'class_a,class_b_1_2' + + amp_skip_amplify = true + amp_skip_macrel = true + amp_skip_ampir = true + amp_skip_hmmsearch = true + + arg_skip_deeparg = true + arg_skip_fargene = true + arg_skip_rgi = true + arg_skip_amrfinderplus = true + arg_skip_deeparg = true + arg_skip_abricate = true + + bgc_skip_antismash = true + bgc_skip_deepbgc = true + bgc_skip_gecco = true + bgc_skip_hmmsearch = true + } diff --git a/conf/test_preannotated.config b/conf/test_preannotated.config new file mode 100644 index 00000000..09ccb0cf --- /dev/null +++ b/conf/test_preannotated.config @@ -0,0 +1,32 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/funcscan -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile - preannotated input' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv' + amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + + annotation_tool = 'pyrodigal' + + run_arg_screening = true + arg_fargene_hmmmodel = 'class_a,class_b_1_2' + + run_amp_screening = true +} diff --git a/conf/test_preannotated_bgc.config b/conf/test_preannotated_bgc.config new file mode 100644 index 00000000..e56d6519 --- /dev/null +++ b/conf/test_preannotated_bgc.config @@ -0,0 +1,31 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/funcscan -profile test_bgc, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'BGC test profile - preannotated input BGC' + config_profile_description = 'Minimal test dataset to check BGC workflow function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv' + bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' + + annotation_tool = 'pyrodigal' + + run_arg_screening = false + run_amp_screening = false + run_bgc_screening = true +} diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config index ad477b3c..2e0cab02 100644 --- a/conf/test_taxonomy.config +++ b/conf/test_taxonomy.config @@ -25,7 +25,7 @@ params { amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' run_taxa_classification = true - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = true arg_skip_deeparg = true diff --git a/docs/output.md b/docs/output.md index 65c73a42..ae542df3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -15,7 +15,7 @@ Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://githu Furthermore, for reproducibility, versions of all software used in the run is presented in a [MultiQC](http://multiqc.info) report. :::info -Note that (unannotated) input contigs will be split into two categories per sample: long and short. Each sample will thus get two sets of results for each ARG/AMP screening (suffixed with `_long` and `_short` respectively, assuming contigs remain above/below the threshold), whereas for BGC results only `_long` will exist. This is because BGCs can only be reliability screened with longer contigs. +Note that if running the BGC subworkflow (unannotated) input contigs will be split into two categories per sample: long and short. 
Each sample will thus get two sets of results for each ARG/AMP screening (suffixed with `_long` and `_short` respectively, assuming contigs remain above/below the threshold), whereas for BGC results only `_long` will exist. This is because BGCs can only be reliably screened with longer contigs. The threshold for the separation can be adjusted with `--contig_qc_lengththreshold`. ::: @@ -149,7 +149,6 @@ Output Summaries: - `prodigal/` - `/`: - - `*.gff`: annotation in GFF3 format, containing both sequences and annotations - `*.fna`: nucleotide FASTA file of the input contig sequences - `*.faa`: protein FASTA file of the translated CDS sequences - `*.gbk`: annotation in GBK format, containing both sequences and annotations @@ -167,9 +166,10 @@ Output Summaries: - `pyrodigal/` - `/`: - - `*.gff`: annotation in GFF3 format, containing both sequences and annotations - - `*.fna`: nucleotide FASTA file of the input contig sequences + - `*.gbk`: annotation in GBK format, containing both sequences and annotations + - `*.fna`: nucleotide FASTA file of the annotated CDS sequences - `*.faa`: protein FASTA file of the translated CDS sequences + - `*.score.gz`: all potential genes (with scores) > Descriptions taken from the [Pyrodigal documentation](https://pyrodigal.readthedocs.io/) diff --git a/docs/usage.md b/docs/usage.md index f102cb1c..bb9814d2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -52,25 +52,41 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s --input '[path to samplesheet file]' ``` -The input samplesheet has to be a comma-separated file (`.csv`) with 2 columns (`sample`, and `fasta`), and a header row as shown in the examples below. +The input samplesheet has to be a comma-separated file (`.csv`) with either 2 columns (`sample`, `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `gbk`), and a header row as shown in the examples below. -```bash +If you already have annotated contigs with peptide sequences and an annotation file in Genbank format (`.gbk` or `.gbff`), you can supply these to the pipeline using the optional `protein` and `gbk` columns. If these additional columns are supplied, pipeline annotation (i.e. with bakta, prodigal, pyrodigal or prokka) will be skipped and the corresponding annotation files used instead. + +For two columns (without pre-annotated data): + +```csv title="samplesheet.csv" sample,fasta sample_1,///wastewater_metagenome_contigs_1.fasta.gz sample_2,///wastewater_metagenome_contigs_2.fasta.gz ``` -| Column | Description | -| -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | -| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.
| +For four columns (with pre-annotated data): + +```csv title="samplesheet.csv" +sample,fasta,protein,gbk +sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gbk +sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk +``` + +| Column | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | +| `gbk` | Optional path to a pre-generated annotation file in Genbank format (`.gbk`, or `.gbff`) format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -:::warning -We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters. +:::danger +We highly recommend performing quality control on input contigs before running the pipeline. -For example, by default BGC screening requires contigs of at least 3,000 bp (see `--contig_qc_lengththreshold`). +For example, **for un-annotated** input if running the BGC screening subworkflow, nf-core/funcscan will by default filter for the BGC subworkflow to screen only contigs with at least 3,000 bp length or more (see `--contig_qc_lengththreshold`). This will split the input contigs into two files: one with contigs of sufficient length for BGC screening and one with contigs below the threshold. Only the former will go for BGC screening, whereas both short and long are used for AMP and ARG screening. Thus when running the BGC subworkflow, all output files will be labelled with the suffix `_long` or `_short` to indicate the length of the contigs. + +In contrast, no such filtering nor relabelling is performed for the **pre-annotated** input sent to the BGC screening subworkflow! If you have pre-annotated contigs, make sure they, and the corresponding annotation files, contain sufficiently high quality and length for the type of molecule to be screened for. ::: ## Notes on screening tools and taxonomic classification @@ -99,11 +115,15 @@ antiSMASH has a minimum contig parameter, in which only contigs of a certain len To prevent entire pipeline failures due to a single 'bad sample', nf-core/funcscan will filter out any input sample in which none of the contigs reach the minimum contig length in bp specified with `--bgc_antismash_sampleminlength` (default: 1000). 
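For orientation, a BGC screening run that raises this per-sample minimum could be launched as follows. This is a minimal sketch only: the samplesheet name, output directory, and the 5,000 bp cutoff are placeholder choices, not pipeline defaults.

```bash
# Minimal sketch: enable BGC screening and raise the per-sample antiSMASH
# contig-length filter (placeholder paths and cutoff, not pipeline defaults).
nextflow run nf-core/funcscan \
    -profile docker \
    --input samplesheet.csv \
    --outdir ./results \
    --run_bgc_screening \
    --bgc_antismash_sampleminlength 5000
```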
-> ⚠️ If a sample does not reach this contig length threshold, you will receive a warning in your console and in the `.nextflow.log` file, and no result files will exist for this sample in your results directory for this tool. +:::warning +If a sample does not reach this contig length threshold, you will receive a warning in your console and in the `.nextflow.log` file, and no result files will exist for this sample in your results directory for this tool. +::: When the annotation is run with Prokka, the resulting `.gbk` file passed to antiSMASH may produce the error `translation longer than location allows` and end the pipeline run. This Prokka bug has been reported before (see [discussion on GitHub](https://github.com/antismash/antismash/discussions/450)) and is not likely to be fixed soon. -> ⚠️ If antiSMASH is run for BGC detection, we recommend to **not** run Prokka for annotation but instead use the default annotation tool (Pyrodigal) or switch to Prodigal, or (for bacteria only!) Bakta. +:::warning +If antiSMASH is run for BGC detection, we recommend to **not** run Prokka for annotation but instead use the default annotation tool (Pyrodigal) or switch to Prodigal, or (for bacteria only!) Bakta. +::: ## Databases and reference files Various tools of nf-core/funcscan use databases and reference files to operate. nf-core/funcscan offers the functionality to auto-download databases for you, and as these databases can be very large, and we suggest to store these files in a central place from where you can reuse them across pipeline runs. -We **highly recommend** allowing the pipeline to download these databases for you on a first run, saving these to your results directory with `--save_databases`, then moving these to a different location (in case you wish to delete the results directory of this first run). An exception to this is HMM files where no auto-downloading functionality is possible. +If your infrastructure has internet access (particularly on compute nodes), we **highly recommend** allowing the pipeline to download these databases for you on a first run, saving these to your results directory with `--save_databases`, then moving these to a different location (in case you wish to delete the results directory of this first run). An exception to this is HMM files where no auto-downloading functionality is possible. + +:::warning -> ⚠️ We generally do not recommend downloading the databases yourself, as this can often be non-trivial to do! +We generally do not recommend downloading the databases yourself, as this can often be non-trivial to do! +::: As a reference, we will describe below where and how you can obtain databases and reference files used for tools included in the pipeline. @@ -135,7 +158,9 @@ And then passed to the pipeline with: --annotation_bakta_db_localpath ///db/ ``` -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### hmmsearch @@ -190,7 +215,9 @@ To obtain a local version of the database: -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future.
+:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### DeepARG @@ -219,7 +246,9 @@ You can then supply the path to resulting database directory with: Note that if you supply your own database that is not downloaded by the pipeline, make sure to also supply `--arg_deeparg_data_version` along with the version number so hAMRonization will correctly display the database version in the summary report. -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### RGI @@ -234,7 +263,9 @@ You can then supply the path to resulting database directory with: --arg_rgi_database '////' ``` -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### antiSMASH @@ -257,9 +288,13 @@ To supply the database directories to the pipeline: Note that the names of the supplied folders must differ from each other (e.g. `antismash_db` and `antismash_dir`). If they are not provided, the databases will be auto-downloaded upon each BGC screening run of the pipeline. -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: -> ℹ️ If installing with conda, the installation directory will be `lib/python3.10/site-packages/antismash` from the base directory of your conda install or conda environment directory. +:::info +If installing with conda, the installation directory will be `lib/python3.10/site-packages/antismash` from the base directory of your conda install or conda environment directory. 
+::: ### DeepBGC diff --git a/nextflow.config b/nextflow.config index 4047b6be..119628bc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -350,12 +350,14 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_bgc { includeConfig 'conf/test_bgc.config' } - test_taxonomy { includeConfig 'conf/test_taxonomy.config' } - test_full { includeConfig 'conf/test_full.config' } - test_deeparg { includeConfig 'conf/test_deeparg.config' } - test_nothing { includeConfig 'conf/test_nothing.config' } + test { includeConfig 'conf/test.config' } + test_bgc { includeConfig 'conf/test_bgc.config' } + test_taxonomy { includeConfig 'conf/test_taxonomy.config' } + test_full { includeConfig 'conf/test_full.config' } + test_deeparg { includeConfig 'conf/test_deeparg.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_preannotated { includeConfig 'conf/test_preannotated.config' } + test_preannotated_bgc { includeConfig 'conf/test_preannotated_bgc.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 14256fb0..c4fabb99 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -274,7 +274,7 @@ "default": "?", "enum": ["+", "-", "?"], "description": "Specify the type of bacteria to be annotated to detect signaling peptides.", - "help_text": "Specify the type of bacteria expected in the input dataset for correct annotation of the signal peptide predictions. More details can be found in the [documentation](https://github.com/oschwengers/bakta/blob/main/README.md#usage).\n\n> Modifies tool parameter(s):\n> - BAKTA: `--gram`", + "help_text": "Specify the type of bacteria expected in the input dataset for correct annotation of the signal peptide predictions. Gram types: +/-/?\nMore details can be found in the [documentation](https://github.com/oschwengers/bakta/blob/main/README.md#usage).\n\n> Modifies tool parameter(s):\n> - BAKTA: `--gram`", "fa_icon": "far fa-plus-square" }, "annotation_bakta_complete": { diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 30f4a171..47dec041 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -14,9 +14,9 @@ include { MERGE_TAXONOMY_AMPCOMBI } from '.. workflow AMP { take: - contigs // tuple val(meta), path(contigs) - faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + fastas // tuple val(meta), path(contigs) + faas // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -27,10 +27,10 @@ workflow AMP { // When adding new tool that requires FAA, make sure to update conditions // in funcscan.nf around annotation and AMP subworkflow execution // to ensure annotation is executed! 
- ch_faa_for_amplify = faa - ch_faa_for_amp_hmmsearch = faa - ch_faa_for_ampir = faa - ch_faa_for_ampcombi = faa + ch_faa_for_amplify = faas + ch_faa_for_amp_hmmsearch = faas + ch_faa_for_ampir = faas + ch_faa_for_ampcombi = faas // AMPLIFY if ( !params.amp_skip_amplify ) { @@ -41,7 +41,7 @@ workflow AMP { // MACREL if ( !params.amp_skip_macrel ) { - MACREL_CONTIGS ( contigs ) + MACREL_CONTIGS ( fastas ) ch_versions = ch_versions.mix( MACREL_CONTIGS.out.versions ) GUNZIP_MACREL_PRED ( MACREL_CONTIGS.out.amp_prediction ) GUNZIP_MACREL_ORFS ( MACREL_CONTIGS.out.all_orfs ) @@ -71,14 +71,15 @@ workflow AMP { [ meta, file ] } - ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch.combine( ch_amp_hmm_models_meta ) - .map { - meta_faa, faa, meta_hmm, hmm -> - def meta_new = [:] - meta_new['id'] = meta_faa['id'] - meta_new['hmm_id'] = meta_hmm['id'] - [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] - } + ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch + .combine( ch_amp_hmm_models_meta ) + .map { + meta_faa, faa, meta_hmm, hmm -> + def meta_new = [:] + meta_new['id'] = meta_faa['id'] + meta_new['hmm_id'] = meta_hmm['id'] + [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] + } AMP_HMMER_HMMSEARCH ( ch_in_for_amp_hmmsearch ) ch_versions = ch_versions.mix( AMP_HMMER_HMMSEARCH.out.versions ) @@ -107,15 +108,15 @@ workflow AMP { //AMPCOMBI concatenation if ( !params.run_taxa_classification ) { - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true ) + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.csv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true ) } else { - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', keepHeader:true ) + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.csv', keepHeader:true ) } // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf new file mode 100644 index 00000000..74bf9666 --- /dev/null +++ b/subworkflows/local/annotation.nf @@ -0,0 +1,93 @@ +/* + Run annotation tools +*/ + +include { PROKKA } from '../../modules/nf-core/prokka/main' +include { PRODIGAL } from '../../modules/nf-core/prodigal/main' +include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main' +include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main' +include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main' +include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PRODIGAL_GBK } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_GBK } from 
'../../modules/nf-core/gunzip/main' + +workflow ANNOTATION { + take: + fasta // tuple val(meta), path(contigs) + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.annotation_tool == "pyrodigal" || ( params.annotation_tool == "prodigal" && params.run_bgc_screening == true && !params.bgc_skip_antismash ) ) { // Need to use pyrodigal for antiSMASH because prodigal GBK annotation format is incompatible with antiSMASH. + + if ( params.annotation_tool == "prodigal" && params.run_bgc_screening == true && !params.bgc_skip_antismash ) { + log.warn("[nf-core/funcscan] Switching annotation tool to: pyrodigal. This is because prodigal annotations (in GBK format) are incompatible with antiSMASH. If you specifically wish to run prodigal instead, please skip antiSMASH or provide a pre-annotated GBK file in the samplesheet.") + } + + PYRODIGAL ( fasta, "gbk" ) + GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) + GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) + GUNZIP_PYRODIGAL_GBK ( PYRODIGAL.out.annotations ) + ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GBK.out.versions) + ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip + ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip + + } else if ( params.annotation_tool == "prodigal" ) { + + PRODIGAL ( fasta, "gbk" ) + GUNZIP_PRODIGAL_FAA ( PRODIGAL.out.amino_acid_fasta ) + GUNZIP_PRODIGAL_FNA ( PRODIGAL.out.nucleotide_fasta) + GUNZIP_PRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) + ch_versions = ch_versions.mix(PRODIGAL.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) + ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip + ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip + + } else if ( params.annotation_tool == "prokka" ) { + + PROKKA ( fasta, [], [] ) + ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_multiqc_files = PROKKA.out.txt.collect{it[1]}.ifEmpty([]) + ch_annotation_faa = PROKKA.out.faa + ch_annotation_fna = PROKKA.out.fna + ch_annotation_gbk = PROKKA.out.gbk + + } else if ( params.annotation_tool == "bakta" ) { + + // BAKTA prepare download + if ( params.annotation_bakta_db_localpath ) { + ch_bakta_db = Channel + .fromPath( params.annotation_bakta_db_localpath ) + .first() + } else { + BAKTA_BAKTADBDOWNLOAD ( ) + ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions ) + ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db ) + } + + BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) + ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) + ch_multiqc_files = BAKTA_BAKTA.out.txt.collect{it[1]}.ifEmpty([]) + ch_annotation_faa = BAKTA_BAKTA.out.faa + ch_annotation_fna = BAKTA_BAKTA.out.fna + ch_annotation_gbk = BAKTA_BAKTA.out.gbff + } + + emit: + versions = ch_versions + multiqc_files = ch_multiqc_files + faa = ch_annotation_faa // [ [meta], path(faa) ] + fna = ch_annotation_fna // [ [meta], path(fna) ] + gbk = ch_annotation_gbk // [ [meta], path(gbk) ] +} diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index f56a1d67..49d536d6 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -22,9 +22,9 @@ include { 
MERGE_TAXONOMY_HAMRONIZATION } from '../../modules/local/merge_t workflow ARG { take: - contigs // tuple val(meta), path(contigs) + fastas // tuple val(meta), path(contigs) annotations - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -45,7 +45,7 @@ workflow ARG { } if ( !params.arg_skip_amrfinderplus ) { - AMRFINDERPLUS_RUN ( contigs, ch_amrfinderplus_db ) + AMRFINDERPLUS_RUN ( fastas, ch_amrfinderplus_db ) ch_versions = ch_versions.mix( AMRFINDERPLUS_RUN.out.versions ) // Reporting @@ -59,20 +59,20 @@ workflow ARG { ch_fargene_classes = Channel.fromList( params.arg_fargene_hmmmodel.tokenize(',') ) - ch_fargene_input = contigs + ch_fargene_input = fastas .combine( ch_fargene_classes ) .map { - meta, contigs, hmm_class -> + meta, fastas, hmm_class -> def meta_new = meta.clone() meta_new['hmm_class'] = hmm_class - [ meta_new, contigs, hmm_class ] + [ meta_new, fastas, hmm_class ] } .multiMap { - contigs: [ it[0], it[1] ] + fastas: [ it[0], it[1] ] hmmclass: it[2] } - FARGENE ( ch_fargene_input.contigs, ch_fargene_input.hmmclass ) + FARGENE ( ch_fargene_input.fastas, ch_fargene_input.hmmclass ) ch_versions = ch_versions.mix( FARGENE.out.versions ) // Reporting @@ -102,7 +102,7 @@ workflow ARG { RGI_CARDANNOTATION ( rgi_database ) ch_versions = ch_versions.mix( RGI_CARDANNOTATION.out.versions ) - RGI_MAIN ( contigs, RGI_CARDANNOTATION.out.db, [] ) + RGI_MAIN ( fastas, RGI_CARDANNOTATION.out.db, [] ) ch_versions = ch_versions.mix( RGI_MAIN.out.versions ) // Reporting @@ -149,7 +149,7 @@ workflow ARG { // ABRicate run if ( !params.arg_skip_abricate ) { - ABRICATE_RUN ( contigs ) + ABRICATE_RUN ( fastas ) ch_versions = ch_versions.mix( ABRICATE_RUN.out.versions ) HAMRONIZATION_ABRICATE ( ABRICATE_RUN.out.report, 'json', '1.0.1', '2021-Mar-27' ) @@ -170,7 +170,7 @@ workflow ARG { // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_HAMRONIZATION( HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list ) ch_versions = ch_versions.mix( MERGE_TAXONOMY_HAMRONIZATION.out.versions ) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 48266303..089f57f7 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -18,11 +18,10 @@ include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/m workflow BGC { take: - fna // tuple val(meta), path(PREPPED_INPUT.out.fna) - gff // tuple val(meta), path(.out.gff) - faa // tuple val(meta), path(.out.faa) - gbk // tuple val(meta), path(.out.gbk) - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + fastas // tuple val(meta), path(PREPPED_INPUT.out.fna) + faas // tuple val(meta), path(.out.faa) + gbks // tuple val(meta), path(.out.gbk) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -31,7 +30,7 @@ workflow BGC { // When adding new tool that requires FAA, make sure to update conditions // in funcscan.nf around annotation and AMP subworkflow execution // to ensure annotation is executed! 
- ch_faa_for_bgc_hmmsearch = faa + ch_faa_for_bgc_hmmsearch = faas // ANTISMASH if ( !params.bgc_skip_antismash ) { @@ -71,35 +70,16 @@ workflow BGC { } - if ( params.annotation_tool == 'prodigal' || params.annotation_tool == "pyrodigal" ) { - - ch_antismash_input = fna.join(gff, by: 0) - .multiMap { - meta, fna, gff -> - fna: [ meta, fna ] - gff: [ gff ] - } - - ANTISMASH_ANTISMASHLITE ( ch_antismash_input.fna, ch_antismash_databases, ch_antismash_directory, ch_antismash_input.gff ) - - } else if ( params.annotation_tool == 'prokka' ) { - - ANTISMASH_ANTISMASHLITE ( gbk, ch_antismash_databases, ch_antismash_directory, [] ) - - } else if ( params.annotation_tool == 'bakta' ) { - - ANTISMASH_ANTISMASHLITE ( gbk, ch_antismash_databases, ch_antismash_directory, [] ) - - } - + ANTISMASH_ANTISMASHLITE ( gbks, ch_antismash_databases, ch_antismash_directory, [] ) ch_versions = ch_versions.mix( ANTISMASH_ANTISMASHLITE.out.versions ) ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir - .mix( ANTISMASH_ANTISMASHLITE.out.gbk_input ) - .groupTuple() - .map{ - meta, files -> - [meta, files.flatten()] - } + .mix( ANTISMASH_ANTISMASHLITE.out.gbk_input ) + .groupTuple() + .map{ + meta, files -> + [meta, files.flatten()] + } + ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) } @@ -116,14 +96,14 @@ workflow BGC { ch_versions = ch_versions.mix( DEEPBGC_DOWNLOAD.out.versions ) } - DEEPBGC_PIPELINE ( fna, ch_deepbgc_database) + DEEPBGC_PIPELINE ( fastas, ch_deepbgc_database) ch_versions = ch_versions.mix( DEEPBGC_PIPELINE.out.versions ) ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( DEEPBGC_PIPELINE.out.bgc_tsv ) } // GECCO if ( !params.bgc_skip_gecco ) { - ch_gecco_input = fna.groupTuple() + ch_gecco_input = fastas.groupTuple() .multiMap { fna: [ it[0], it[1], [] ] } @@ -180,7 +160,7 @@ workflow BGC { // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_COMBGC( ch_combgc_summaries, ch_mmseqs_taxonomy_list ) ch_versions = ch_versions.mix( MERGE_TAXONOMY_COMBGC.out.versions ) diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index ec9f273a..898f570f 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -36,15 +36,20 @@ workflow TAXA_CLASS { // MMSEQS_CREATEDB MMSEQS_CREATEDB ( contigs ) ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) - ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db // MMSEQS_TAXONOMY - MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) + MMSEQS_TAXONOMY ( MMSEQS_CREATEDB.out.db, ch_mmseqs_db ) ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions ) - ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + + ch_taxonomy_input_for_createtsv = MMSEQS_CREATEDB.out.db + .join(MMSEQS_TAXONOMY.out.db_taxonomy) + .multiMap { meta, db, db_taxonomy -> + db: [ meta,db ] + taxdb: [ meta, db_taxonomy ] + } // MMSEQS_CREATETSV - MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) + MMSEQS_CREATETSV ( ch_taxonomy_input_for_createtsv.taxdb, [[:],[]], ch_taxonomy_input_for_createtsv.db ) ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions ) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv } diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 5d74b7c7..e2c26d1a 100644 --- 
a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -152,7 +152,7 @@ def validateInputParameters() { // 3. Give warning if not using container system assuming conda - if ( params.run_bgc_screening && ( !params.bgc_antismash_databases || !params.bgc_antismash_installationdirectory ) && !params.bgc_skip_antismash && ( session.config.conda && session.config.conda.enabled ) ) { log.warn "[nf-core/funcscan] Running antiSMASH download database module, and detected conda has been enabled. Assuming using conda for pipeline run, check config if this is not expected!" } + if ( params.run_bgc_screening && ( !params.bgc_antismash_databases || !params.bgc_antismash_installationdirectory ) && !params.bgc_skip_antismash && ( session.config.conda && session.config.conda.enabled ) ) { log.warn "[nf-core/funcscan] Running antiSMASH download database module, and detected conda has been enabled. Assuming using conda for pipeline run. Check config if this is not expected!" } } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 1635c7c7..1e33e9ef 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -29,9 +29,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { AMP } from '../subworkflows/local/amp' -include { ARG } from '../subworkflows/local/arg' -include { BGC } from '../subworkflows/local/bgc' +include { ANNOTATION } from '../subworkflows/local/annotation' +include { AMP } from '../subworkflows/local/amp' +include { ARG } from '../subworkflows/local/arg' +include { BGC } from '../subworkflows/local/bgc' include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* @@ -43,21 +44,8 @@ include { TAXA_CLASS } from '../subworkflows/local/taxa_class' // // MODULE: Installed directly from nf-core/modules // -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { GUNZIP as GUNZIP_FASTA_PREP } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../modules/nf-core/gunzip/main' -include { PROKKA } from '../modules/nf-core/prokka/main' -include { PRODIGAL as PRODIGAL_GFF } from '../modules/nf-core/prodigal/main' -include { PRODIGAL as PRODIGAL_GBK } from '../modules/nf-core/prodigal/main' -include { PYRODIGAL as PYRODIGAL_GBK } from '../modules/nf-core/pyrodigal/main' -include { PYRODIGAL as PYRODIGAL_GFF } from '../modules/nf-core/pyrodigal/main' -include { BAKTA_BAKTADBDOWNLOAD } from '../modules/nf-core/bakta/baktadbdownload/main' -include { BAKTA_BAKTA } from '../modules/nf-core/bakta/bakta/main' +include { GUNZIP as GUNZIP_INPUT_PREP } from '../modules/nf-core/gunzip/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' include { SEQKIT_SEQ as SEQKIT_SEQ_LONG } from '../modules/nf-core/seqkit/seq/main' include { SEQKIT_SEQ as SEQKIT_SEQ_SHORT } from '../modules/nf-core/seqkit/seq/main' @@ -79,43 +67,129 @@ workflow FUNCSCAN { ch_input = Channel.fromSamplesheet("input") + /////////////////////// + // INPUT PREPARATION // + 
/////////////////////// + // Some tools require uncompressed input - fasta_prep = ch_input - .branch { - compressed: it[1].toString().endsWith('.gz') - uncompressed: it[1] - } + ch_input_prep = ch_input + .map{meta, fasta, faa, gbk -> [meta, [fasta, faa, gbk]]} + .transpose() + .branch { + compressed: it[1].toString().endsWith('.gz') + uncompressed: it[1] + } - GUNZIP_FASTA_PREP ( fasta_prep.compressed ) - ch_versions = ch_versions.mix( GUNZIP_FASTA_PREP.out.versions ) + GUNZIP_INPUT_PREP ( ch_input_prep.compressed ) + ch_versions = ch_versions.mix( GUNZIP_INPUT_PREP.out.versions ) // Merge all the already uncompressed and newly compressed FASTAs here into // a single input channel for downstream - ch_unzipped_fastas = GUNZIP_FASTA_PREP.out.gunzip - .mix( fasta_prep.uncompressed ) + ch_intermediate_input = GUNZIP_INPUT_PREP.out.gunzip + .mix( ch_input_prep.uncompressed ) + .groupTuple() + .map{ + meta, files -> + def fasta_found = files.find{it.toString().tokenize('.').last().matches('fasta|fas|fna|fa')} + def faa_found = files.find{it.toString().endsWith('.faa')} + def gbk_found = files.find{it.toString().tokenize('.').last().matches('gbk|gbff')} + def fasta = fasta_found != null ? fasta_found : [] + def faa = faa_found != null ? faa_found : [] + def gbk = gbk_found != null ? gbk_found : [] + + [meta, fasta, faa, gbk] + } + .branch { + meta, fasta, faa, gbk -> + preannotated: gbk != [] + fastas: true + } // Split each FASTA into long and short contigs to // speed up BGC workflow with BGC-compatible contig lengths only - SEQKIT_SEQ_LONG ( ch_unzipped_fastas ) - SEQKIT_SEQ_SHORT ( ch_unzipped_fastas ) - ch_versions = ch_versions.mix( SEQKIT_SEQ_LONG.out.versions ) - ch_versions = ch_versions.mix( SEQKIT_SEQ_SHORT.out.versions ) + // Only if BGC screening is enabled! 
+ if ( params.run_bgc_screening) { + + ch_intermediate_fasta_for_split = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } + SEQKIT_SEQ_LONG ( ch_intermediate_fasta_for_split ) + SEQKIT_SEQ_SHORT ( ch_intermediate_fasta_for_split ) + ch_versions = ch_versions.mix( SEQKIT_SEQ_LONG.out.versions ) + ch_versions = ch_versions.mix( SEQKIT_SEQ_SHORT.out.versions ) + + ch_intermediate_input_long = SEQKIT_SEQ_LONG.out.fastx + .map{ meta, file -> [ meta + [id: meta.id + '_long', length: "long" ], file ] } + .filter{ + meta, fasta -> + if ( fasta.isEmpty() ) { log.warn("[nf-core/funcscan] The following sample did not contain contigs longer than ${params.contig_qc_lengththreshold} BGC screening will not be executed: ${meta.id}") } + !fasta.isEmpty() + } + + ch_intermediate_input_short = SEQKIT_SEQ_SHORT.out.fastx + .map{ meta, file -> [ meta + [id: meta.id + '_short', length: "short" ], file ] } + .filter{ + meta, fasta -> + !fasta.isEmpty() + } + + // Now they are split, can annotated together for efficiency + ch_input_for_annotation = ch_intermediate_input_long.mix( ch_intermediate_input_short ) + } else { + ch_input_for_annotation = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } + } - ch_prepped_input_long = SEQKIT_SEQ_LONG.out.fastx - .map{ meta, file -> [ meta + [id: meta.id + '_long', length: "long" ], file ] } - .filter{ - meta, fasta -> - !fasta.isEmpty() - } + /* + ANNOTATION + */ + + // Some tools require annotated FASTAs + if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { - ch_prepped_input_short = SEQKIT_SEQ_SHORT.out.fastx - .map{ meta, file -> [ meta + [id: meta.id + '_short', length: "short" ], file ]} + ANNOTATION( ch_input_for_annotation ) + ch_versions = ch_versions.mix( ANNOTATION.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files ) + + ch_new_annotation = ch_input_for_annotation + .join( ANNOTATION.out.faa ) + .join( ANNOTATION.out.gbk ) + + } else { + ch_new_annotation = Channel.empty() + } + + // Mix back the preannotated samples with the newly annotated ones, + // but also have dedicated channel for subworkflows that should only use + // for long contigs + ch_prepped_input = ch_intermediate_input.preannotated + .mix( ch_new_annotation ) + .multiMap { + meta, fasta, faa, gbk -> + fastas: [meta, fasta] + faas: [meta, faa] + gbks: [meta, gbk] + } + + // Generate long contigs only channel only when BGC screening is enabled + if ( params.run_bgc_screening) { + + ch_prepped_input_long = ch_new_annotation .filter{ - meta, fasta -> - !fasta.isEmpty() + meta, fasta, faa, gbk -> + meta.length == "long" + } + .mix(ch_intermediate_input.preannotated) + .map { + meta, fasta, faa, gbk -> + if ( params.run_bgc_screening && meta.length == null ) { log.warn("[nf-core/funcscan] Pre-annotated input will not be filtered to long contigs for BGC screening! Expect long-run times and/or possible crashes if includes very short contigs. 
Sample: ${meta.id}") } + [meta, fasta, faa, gbk] + } + .multiMap { + meta, fasta, faa, gbk -> + fastas: [meta, fasta] + faas: [meta, faa] + gbks: [meta, gbk] } - ch_prepped_input = ch_prepped_input_long.mix( ch_prepped_input_short ) + } /* TAXONOMIC CLASSIFICATION @@ -123,111 +197,39 @@ workflow FUNCSCAN { // The final subworkflow reports need taxonomic classification. // This can be either on NT or AA level depending on annotation. - // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. if ( params.run_taxa_classification ) { - TAXA_CLASS ( ch_prepped_input ) + + if ( params.run_bgc_screening && !params.run_amp_screening && !params.run_arg_screening ) { + ch_input_for_taxonomy = ch_prepped_input_long.fastas + } else { + ch_input_for_taxonomy = ch_prepped_input.fastas + } + + TAXA_CLASS ( ch_input_for_taxonomy ) ch_versions = ch_versions.mix( TAXA_CLASS.out.versions ) ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy } else { - ch_mmseqs_db = Channel.empty() ch_taxonomy_querydb = Channel.empty() ch_taxonomy_querydb_taxdb = Channel.empty() ch_taxonomy_tsv = Channel.empty() } - /* - ANNOTATION - */ - - // Some tools require annotated FASTAs - // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) - if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { - - if ( params.annotation_tool == "prodigal" ) { - PRODIGAL_GFF ( ch_prepped_input, "gff" ) - GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) - GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta ) - GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) - ch_versions = ch_versions.mix( PRODIGAL_GFF.out.versions ) - ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive - - if ( params.save_annotations == true ) { - PRODIGAL_GBK ( ch_prepped_input, "gbk" ) - ch_versions = ch_versions.mix( PRODIGAL_GBK.out.versions ) - ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. - } - } else if ( params.annotation_tool == "pyrodigal" ) { - PYRODIGAL_GFF ( ch_prepped_input, "gff" ) - GUNZIP_PYRODIGAL_FAA ( PYRODIGAL_GFF.out.faa ) - GUNZIP_PYRODIGAL_FNA ( PYRODIGAL_GFF.out.fna ) - GUNZIP_PYRODIGAL_GFF ( PYRODIGAL_GFF.out.annotations ) - ch_versions = ch_versions.mix( PYRODIGAL_GFF.out.versions ) - ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Pyrodigal GBK and GFF output are mutually exclusive - - if ( params.save_annotations == true ) { - PYRODIGAL_GBK ( ch_prepped_input, "gbk" ) - ch_versions = ch_versions.mix( PYRODIGAL_GBK.out.versions ) - ch_annotation_gbk = PYRODIGAL_GBK.out.annotations // Pyrodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. 
+
+        TAXA_CLASS ( ch_input_for_taxonomy )
         ch_versions = ch_versions.mix( TAXA_CLASS.out.versions )
         ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy
     } else {
-        ch_mmseqs_db = Channel.empty()
         ch_taxonomy_querydb = Channel.empty()
         ch_taxonomy_querydb_taxdb = Channel.empty()
         ch_taxonomy_tsv = Channel.empty()
     }

-    /*
-        ANNOTATION
-    */
-
-    // Some tools require annotated FASTAs
-    // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both)
-    if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) {
-
-        if ( params.annotation_tool == "prodigal" ) {
-            PRODIGAL_GFF ( ch_prepped_input, "gff" )
-            GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
-            GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta )
-            GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations )
-            ch_versions = ch_versions.mix( PRODIGAL_GFF.out.versions )
-            ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip
-            ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip
-            ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip
-            ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive
-
-            if ( params.save_annotations == true ) {
-                PRODIGAL_GBK ( ch_prepped_input, "gbk" )
-                ch_versions = ch_versions.mix( PRODIGAL_GBK.out.versions )
-                ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow.
-            }
-        } else if ( params.annotation_tool == "pyrodigal" ) {
-            PYRODIGAL_GFF ( ch_prepped_input, "gff" )
-            GUNZIP_PYRODIGAL_FAA ( PYRODIGAL_GFF.out.faa )
-            GUNZIP_PYRODIGAL_FNA ( PYRODIGAL_GFF.out.fna )
-            GUNZIP_PYRODIGAL_GFF ( PYRODIGAL_GFF.out.annotations )
-            ch_versions = ch_versions.mix( PYRODIGAL_GFF.out.versions )
-            ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip
-            ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip
-            ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip
-            ch_annotation_gbk = Channel.empty() // Pyrodigal GBK and GFF output are mutually exclusive
-
-            if ( params.save_annotations == true ) {
-                PYRODIGAL_GBK ( ch_prepped_input, "gbk" )
-                ch_versions = ch_versions.mix( PYRODIGAL_GBK.out.versions )
-                ch_annotation_gbk = PYRODIGAL_GBK.out.annotations // Pyrodigal GBK output stays zipped because it is currently not used by any downstream subworkflow.
-            }
-        } else if ( params.annotation_tool == "prokka" ) {
-            PROKKA ( ch_prepped_input, [], [] )
-            ch_versions = ch_versions.mix( PROKKA.out.versions )
-            ch_annotation_faa = PROKKA.out.faa
-            ch_annotation_fna = PROKKA.out.fna
-            ch_annotation_gff = PROKKA.out.gff
-            ch_annotation_gbk = PROKKA.out.gbk
-        } else if ( params.annotation_tool == "bakta" ) {
-
-            // BAKTA prepare download
-            if ( params.annotation_bakta_db_localpath ) {
-                ch_bakta_db = Channel
-                    .fromPath( params.annotation_bakta_db_localpath )
-                    .first()
-            } else {
-                BAKTA_BAKTADBDOWNLOAD ( )
-                ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions )
-                ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db )
-            }
-
-            BAKTA_BAKTA ( ch_prepped_input, ch_bakta_db, [], [] )
-            ch_versions = ch_versions.mix( BAKTA_BAKTA.out.versions )
-            ch_annotation_faa = BAKTA_BAKTA.out.faa
-            ch_annotation_fna = BAKTA_BAKTA.out.fna
-            ch_annotation_gff = BAKTA_BAKTA.out.gff
-            ch_annotation_gbk = BAKTA_BAKTA.out.gbff
-        }
-
-    } else {
-
-        ch_annotation_faa = Channel.empty()
-        ch_annotation_fna = Channel.empty()
-        ch_annotation_gff = Channel.empty()
-        ch_annotation_gbk = Channel.empty()
-
-    }
-
-    /*
-        SCREENING
-    */
+    ///////////////
+    // SCREENING //
+    ///////////////

     /*
         AMPs
     */
     if ( params.run_amp_screening && !params.run_taxa_classification ) {
         AMP (
-            ch_prepped_input,
-            ch_annotation_faa
+            ch_prepped_input.fastas,
+            ch_prepped_input.faas
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
                 },
@@ -236,17 +238,17 @@ workflow FUNCSCAN {
         ch_versions = ch_versions.mix(AMP.out.versions)
     } else if ( params.run_amp_screening && params.run_taxa_classification ) {
         AMP (
-            ch_prepped_input,
-            ch_annotation_faa
+            ch_prepped_input.fastas,
+            ch_prepped_input.faas
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                        !file.isEmpty()
                },
             ch_taxonomy_tsv
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
                        !file.isEmpty()
                }
        )
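+        // Note on the filter pattern used above: the `file != []` guard skips the
+        // warning for placeholder entries (e.g. a sample for which no FAA/TSV exists),
+        // while `!file.isEmpty()` drops both placeholders and zero-byte files, so the
+        // AMP tools only ever receive usable inputs.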
@@ -259,17 +261,17 @@ workflow FUNCSCAN {
     if ( params.run_arg_screening && !params.run_taxa_classification ) {
         if ( params.arg_skip_deeparg ) {
             ARG (
-                ch_prepped_input,
+                ch_prepped_input.fastas,
                 [],
                 ch_taxonomy_tsv
             )
         } else {
             ARG (
-                ch_prepped_input,
-                ch_annotation_faa
+                ch_prepped_input.fastas,
+                ch_prepped_input.faas
                     .filter {
                         meta, file ->
-                            if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                            if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}")
                             !file.isEmpty()
                     },
                 ch_taxonomy_tsv
@@ -279,28 +281,28 @@ workflow FUNCSCAN {
     } else if ( params.run_arg_screening && params.run_taxa_classification ) {
         if ( params.arg_skip_deeparg ) {
             ARG (
-                ch_prepped_input,
+                ch_prepped_input.fastas,
                 [],
                 ch_taxonomy_tsv
                     .filter {
                         meta, file ->
-                            if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                            if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
                             !file.isEmpty()
                     }
             )
         } else {
             ARG (
-                ch_prepped_input,
-                ch_annotation_faa
+                ch_prepped_input.fastas,
+                ch_prepped_input.faas
                     .filter {
                         meta, file ->
-                            if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                            if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}")
                             !file.isEmpty()
                     },
                 ch_taxonomy_tsv
                     .filter {
                         meta, file ->
-                            if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                            if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
                             !file.isEmpty()
                     }
             )
@@ -312,54 +314,45 @@ workflow FUNCSCAN {
         BGCs
     */
     if ( params.run_bgc_screening && !params.run_taxa_classification ) {
+
+        ch_filtered_taxonomytsv_for_bgc = ch_taxonomy_tsv.dump(tag: 'ch_taxonomy_tsv_for_bgc.tsv')
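+        // `.dump()` is a debugging aid: the channel content is only printed when the
+        // pipeline is launched with `-dump-channels` (optionally restricted to this
+        // tag, e.g. `-dump-channels ch_taxonomy_tsv_for_bgc.tsv`); otherwise the
+        // channel passes through unchanged.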
+
         BGC (
-            ch_prepped_input_long,
-            ch_annotation_gff
-                .filter {
-                    meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}")
-                        !file.isEmpty()
-                },
-            ch_annotation_faa
+            ch_prepped_input_long.fastas,
+            ch_prepped_input_long.faas
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
                 },
-            ch_annotation_gbk
+            ch_prepped_input_long.gbks
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
                 },
-            ch_taxonomy_tsv
+            ch_filtered_taxonomytsv_for_bgc
         )
         ch_versions = ch_versions.mix( BGC.out.versions )
     } else if ( params.run_bgc_screening && params.run_taxa_classification ) {
         BGC (
-            ch_prepped_input,
-            ch_annotation_gff
+            ch_prepped_input_long.fastas,
+            ch_prepped_input_long.faas
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
                 },
-            ch_annotation_faa
+            ch_prepped_input_long.gbks
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
-                        !file.isEmpty()
-                },
-            ch_annotation_gbk
-                .filter {
-                    meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
                         !file.isEmpty()
                 },
             ch_taxonomy_tsv
                 .filter {
                     meta, file ->
-                        if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+                        if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
                         !file.isEmpty()
                 }
         )
@@ -409,9 +402,7 @@ workflow FUNCSCAN {
            )
        )

-    if( params.annotation_tool=='prokka' ) {
-        ch_multiqc_files = ch_multiqc_files.mix( PROKKA.out.txt.collect{it[1]}.ifEmpty([]) )
-    }
+    ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files.collect{it[1]}.ifEmpty([]) )

     MULTIQC (
         ch_multiqc_files.collect(),