Skip to content

Commit 8ba2862

Browse files
authored
Merge pull request #20 from JoseEspinosa/updates
Add validation for fasta input and tests
2 parents 278d34f + cc50269 commit 8ba2862

22 files changed

+187
-199
lines changed

.github/workflows/ci.yml

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,3 +44,32 @@ jobs:
4444
# Remember that you can parallelise this by using strategy.matrix
4545
run: |
4646
nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
47+
48+
test_fasta:
49+
name: Run pipeline with test data with fasta files in samplesheet
50+
# Only run on push if this is the nf-core dev branch (merged PRs)
51+
if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/reportho') }}"
52+
runs-on: ubuntu-latest
53+
strategy:
54+
matrix:
55+
NXF_VER:
56+
- "23.04.0"
57+
- "latest-everything"
58+
steps:
59+
- name: Check out pipeline code
60+
uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4
61+
62+
- name: Install Nextflow
63+
uses: nf-core/setup-nextflow@v2
64+
with:
65+
version: "${{ matrix.NXF_VER }}"
66+
67+
- name: Disk space cleanup
68+
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
69+
70+
- name: Run pipeline with test data
71+
# TODO nf-core: You can customise CI pipeline run tests as required
72+
# For example: adding multiple test runs with different parameters
73+
# Remember that you can parallelise this by using strategy.matrix
74+
run: |
75+
nextflow run ${GITHUB_WORKSPACE} -profile test_fasta,docker --outdir ./results

README.md

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,21 +44,20 @@ Steps that follow can be skipped with `--skip_downstream` in batch analysis.
4444
4545
First, prepare a samplesheet with your input data that looks as follows:
4646

47-
`samplesheet.csv`:
48-
49-
```csv
50-
id,query
47+
```csv title="samplesheet_fasta.csv"
48+
id,fasta
5149
BicD2,data/bicd2.fasta
5250
```
5351

54-
or:
52+
or, if you know the UniProt ID of the protein, you can provide it directly:
5553

56-
```csv
54+
```csv title="samplesheet.csv"
5755
id,query
5856
BicD2,Q8TD16
5957
```
6058

61-
If using the latter format, you must set `--uniprot_query` to true.
59+
> [!NOTE]
60+
> If you provide both a FASTA file and a UniProt ID, only the latter will be used.
6261
6362
Now, you can run the pipeline using:
6463

assets/samplesheet_fasta.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
id,fasta
2+
ste2,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste2.fa
3+
ste3,https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/sequences/ste3.fa

assets/schema_input.json

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,22 @@
1717
"type": "string",
1818
"pattern": "^\\S+$",
1919
"errorMessage": "A query must be provided"
20+
},
21+
"fasta": {
22+
"type": "string",
23+
"format": "file-path",
24+
"exists": true,
25+
"pattern": "^\\S+\\.fa(sta)?$",
26+
"errorMessage": "Fasta file must be provided, cannot contain spaces and must have extension '.fa' or '.fasta'"
2027
}
28+
}
29+
},
30+
"anyOf": [
31+
{
32+
"required": ["id", "query"]
2133
},
22-
"required": ["id", "query"]
23-
}
34+
{
35+
"required": ["id", "fasta"]
36+
}
37+
]
2438
}

bin/fetch_oma_by_sequence.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ def main() -> None:
3030

3131
# Find the main isoform
3232
for it in json["targets"]:
33-
if it["is_main_isoform"]:
34-
entry = it
35-
break
33+
if it["is_main_isoform"]:
34+
entry = it
35+
break
3636

3737
# Write exact match status
3838
if json["identified_by"] == "exact match":

conf/test.config

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ params {
2323
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'
2424

2525
// Other parameters
26-
uniprot_query = true
2726
skip_eggnog = true
2827
min_score = 3
2928
skip_iqtree = true

conf/test_fasta.config

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
/*
2+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3+
Nextflow config file for running minimal tests
4+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5+
Defines input files and everything required to run a fast and simple pipeline test.
6+
7+
Use as follows:
8+
nextflow run nf-core/reportho -profile test,<docker/singularity> --outdir <OUTDIR>
9+
10+
----------------------------------------------------------------------------------------
11+
*/
12+
13+
params {
14+
config_profile_name = 'Test profile'
15+
config_profile_description = 'Minimal test dataset to check pipeline function'
16+
17+
// Limit resources so that this can run on GitHub Actions
18+
max_cpus = 2
19+
max_memory = '6.GB'
20+
max_time = '6.h'
21+
22+
// Input data
23+
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet_fasta.csv'
24+
25+
// Other parameters
26+
skip_eggnog = true
27+
min_score = 3
28+
skip_iqtree = true
29+
fastme_bootstrap = 0
30+
}
31+

conf/test_full.config

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ params {
1818
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/reportho/testdata/samplesheet/samplesheet.csv'
1919

2020
// Other parameters
21-
uniprot_query = true
2221
eggnog_path = 'http://eggnog5.embl.de/download/eggnog_5.0/per_tax_level/1/1_members.tsv.gz'
2322
eggnog_idmap_path = "http://eggnog5.embl.de/download/eggnog_5.0/id_mappings/uniprot/latest.Eukaryota.tsv.gz"
2423
min_score = 3

docs/usage.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,28 +18,29 @@ You will need to create a samplesheet with information about the samples you wou
1818

1919
### Full samplesheet
2020

21-
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the table below.
21+
The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 2 columns to match those defined in the tables below.
2222

23-
A final samplesheet file may look something like the one below, with `--uniprot_query` enabled:
23+
A final samplesheet file may look something like the one below:
2424

2525
```csv title="samplesheet.csv"
2626
id,query
2727
BicD2,Q8TD16
2828
```
2929

30-
or the one below, otherwise:
30+
or the one below, if you provide the sequence of the protein in FASTA format:
3131

3232
```csv title="samplesheet.csv"
33-
id,query
33+
id,fasta
3434
BicD2,/home/myuser/data/bicd2.fa
3535
```
3636

37-
| Column | Description |
38-
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
39-
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
40-
| `query` | The query of the user-specified type. If `--uniprot_query` is `true`, it should be a valid Uniprot accession. Otherwise, it should be a valid path to a FASTA file. |
37+
| Column | Description |
38+
| ------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
39+
| `id` | User-defined identifier. It is used to identify output files for the protein. Can be anything descriptive, as long as it does not contain spaces. |
40+
| `query` | The query of the user-specified type. It should be a valid Uniprot accession. |
41+
| `fasta` | It should be a valid path to a FASTA file. |
4142

42-
An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
43+
An [example Uniprot samplesheet](../assets/samplesheet.csv) and an [example FASTA samplesheet](../assets/samplesheet_fasta.csv) have been provided with the pipeline.
4344

4445
## Running the pipeline
4546

lib/nfcore_external_java_deps.jar

Whitespace-only changes.

main.nf

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,17 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_repo
3333
workflow NFCORE_REPORTHO {
3434

3535
take:
36-
samplesheet // channel: samplesheet read in from --input
36+
samplesheet_query // channel: samplesheet read in from --input with query
37+
samplesheet_fasta // channel: samplesheet read in from --input with fasta
3738

3839
main:
3940

4041
//
4142
// WORKFLOW: Run pipeline
4243
//
4344
REPORTHO (
44-
samplesheet
45+
samplesheet_query,
46+
samplesheet_fasta,
4547
)
4648

4749
// emit:
@@ -75,7 +77,8 @@ workflow {
7577
// WORKFLOW: Run main workflow
7678
//
7779
NFCORE_REPORTHO (
78-
PIPELINE_INITIALISATION.out.samplesheet
80+
PIPELINE_INITIALISATION.out.samplesheet_query,
81+
PIPELINE_INITIALISATION.out.samplesheet_fasta,
7982
)
8083

8184
//

modules/local/dump_params.nf

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ process DUMP_PARAMS {
88

99
input:
1010
tuple val(meta), path(exact)
11-
val uniprot_query
1211
val use_structures
1312
val use_centroid
1413
val min_score
@@ -26,7 +25,6 @@ process DUMP_PARAMS {
2625
"""
2726
cat <<- END_PARAMS > params.yml
2827
id: ${meta.id}
29-
uniprot_query: ${uniprot_query}
3028
exact_match: \$(cat $exact)
3129
use_structures: ${use_structures}
3230
use_centroid: ${use_centroid}

modules/local/fetch_sequences_online.nf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ process FETCH_SEQUENCES_ONLINE {
2020
task.ext.when == null || task.ext.when
2121

2222
script:
23-
prefix = task.ext.prefix ?: meta.id
24-
add_query = params.uniprot_query ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
23+
def prefix = task.ext.prefix ?: meta.id
24+
def add_query = query_fasta == [] ? "" : "cat $query_fasta >> ${prefix}_orthologs.fa"
2525
"""
2626
fetch_sequences.py $ids $prefix > ${prefix}_orthologs.fa
2727
$add_query

nextflow.config

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
params {
1111
// Input options
1212
input = null
13-
uniprot_query = false
1413

1514
// MultiQC options
1615
multiqc_config = null
@@ -200,8 +199,9 @@ profiles {
200199
executor.cpus = 4
201200
executor.memory = 8.GB
202201
}
203-
test { includeConfig 'conf/test.config' }
204-
test_full { includeConfig 'conf/test_full.config' }
202+
test { includeConfig 'conf/test.config' }
203+
test_fasta { includeConfig 'conf/test_fasta.config' }
204+
test_full { includeConfig 'conf/test_full.config' }
205205
}
206206

207207
// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile
@@ -214,7 +214,7 @@ singularity.registry = 'quay.io'
214214

215215
// Nextflow plugins
216216
plugins {
217-
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
217+
id 'nf-[email protected]' // Validation of pipeline parameters and creation of an input channel from a sample sheet
218218
}
219219

220220
// Export these variables to prevent local Python/R libraries from conflicting with those in the container

nextflow_schema.json

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,6 @@
2323
"help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/reportho/usage#samplesheet-input).",
2424
"fa_icon": "fas fa-file-csv"
2525
},
26-
"uniprot_query": {
27-
"type": "boolean",
28-
"description": "The input contains a Uniprot ID as query.",
29-
"help_text": "If the input file contains a Uniprot ID as query, set this parameter to `true`.",
30-
"fa_icon": "fas fa-database"
31-
},
3226
"outdir": {
3327
"type": "string",
3428
"format": "directory-path",

subworkflows/local/align.nf

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@ workflow ALIGN {
2525
ch_for_filter
2626
)
2727

28-
ch_versions
29-
.mix(FILTER_FASTA.out.versions)
30-
.set { ch_versions }
28+
ch_versions = ch_versions.mix(FILTER_FASTA.out.versions)
3129

3230
CREATE_TCOFFEETEMPLATE(
3331
ch_pdb
@@ -52,9 +50,8 @@ workflow ALIGN {
5250
TCOFFEE_3DALIGN.out.alignment
5351
.set { ch_alignment }
5452

55-
ch_versions
56-
.mix(TCOFFEE_3DALIGN.out.versions)
57-
.set { ch_versions }
53+
ch_versions = ch_versions.mix(TCOFFEE_3DALIGN.out.versions)
54+
5855
}
5956
else {
6057
TCOFFEE_ALIGN (
@@ -67,9 +64,7 @@ workflow ALIGN {
6764
TCOFFEE_ALIGN.out.alignment
6865
.set { ch_alignment }
6966

70-
ch_versions
71-
.mix(TCOFFEE_ALIGN.out.versions)
72-
.set { ch_versions }
67+
ch_versions = ch_versions.mix(TCOFFEE_ALIGN.out.versions)
7368
}
7469

7570
emit:

subworkflows/local/fetch_sequences.nf

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@ include { FETCH_SEQUENCES_ONLINE } from "../../modules/local/fetch_sequences_onl
22

33
workflow FETCH_SEQUENCES {
44
take:
5-
ch_idlist
6-
ch_query_fasta
5+
ch_id_list
6+
ch_query
77

88
main:
9+
ch_id_list
10+
.join(ch_query)
11+
.set { ch_input }
912

10-
ch_input = params.uniprot_query ? ch_idlist.map { it -> [it[0], it[1], []]} : ch_idlist.join(ch_query_fasta)
1113
FETCH_SEQUENCES_ONLINE (
1214
ch_input
1315
)

0 commit comments

Comments
 (0)