Skip to content

Commit 0c5e51b

Browse files
committed
refactor complete
1 parent a2f4c2d commit 0c5e51b

File tree

5 files changed

+494
-0
lines changed

5 files changed

+494
-0
lines changed

.gitignore

+22
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,25 @@ cython_debug/
158158
# and can be added to the global gitignore or merged into this file. For a more nuclear
159159
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
160160
#.idea/
161+
162+
# nextflow stuff
163+
.nextflow/
164+
work/
165+
results/
166+
167+
.nextflow.log*
168+
trace.txt*
169+
report.html.*
170+
timeline.html*
171+
report.html*
172+
*.dot
173+
*.png.*
174+
175+
# macos stuff
176+
._*
177+
.DS_Store
178+
179+
# bioinformatics files
180+
*.fa*
181+
*.gtf
182+
*.bam*

.pre-commit-config.yaml

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# pre-commit configuration: generic hygiene hooks from the upstream
# pre-commit-hooks repository, pinned to a fixed revision.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: check-docstring-first
      - id: check-yaml
      - id: check-toml
      - id: end-of-file-fixer
      - id: trailing-whitespace

main.nf

+248
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
#!/usr/bin/env nextflow
2+
3+
nextflow.enable.dsl = 2
4+
5+
6+
7+
// WORKFLOW SPECIFICATION
8+
// --------------------------------------------------------------- //
9+
workflow {

    // Fail fast with a readable message when a required input is missing
    // or points at a path that does not exist.
    assert params.pb_fastq : "Please provide a PacBio HiFi CCS FASTQ.gz file with the --pb_fastq argument."
    assert file(params.pb_fastq).exists() : "Provided path to PacBio FASTQ does not exist."
    assert params.ont_fastq : "Please provide an Oxford Nanopore FASTQ.gz file with the --ont_fastq argument."
    assert file(params.ont_fastq).exists() : "Provided path to Nanopore FASTQ does not exist."
    assert params.ref_fasta : "Please provide a reference FASTA with the --ref_fasta argument."
    assert file(params.ref_fasta).exists() : "Provided path to reference FASTA does not exist."
    assert params.desired_regions : "Please provide a tab-delimited table of regions with the --desired_regions argument."
    assert file(params.desired_regions).exists() : "Provided path to desired regions table does not exist."


    // input channels

    // `file: true` makes splitFastq write every chunk to disk and emit its
    // path; without it the operator emits the chunk text itself, which cannot
    // be staged as a `path` input by MAP_TO_REF.
    ch_pb_reads = Channel
        .fromPath ( params.pb_fastq )
        .splitFastq ( by: params.split_max, file: true )
        .map { fastq -> tuple( file(fastq), file(fastq).getBaseName(), "pacbio" ) }

    ch_ont_reads = Channel
        .fromPath ( params.ont_fastq )
        .splitFastq ( by: params.split_max, file: true )
        .map { fastq -> tuple( file(fastq), file(fastq).getBaseName(), "ont" ) }

    ch_ref = Channel
        .fromPath ( params.ref_fasta )

    // One tuple per row of the tab-delimited regions table. The table must
    // carry the header columns samtools_expression, file_label and description.
    ch_desired_regions = Channel
        .fromPath ( params.desired_regions )
        .splitCsv ( header: true, sep: "\t" )
        .map { row -> tuple( row.samtools_expression, row.file_label, row.description ) }


    // Workflow steps
    MAP_TO_REF (
        ch_pb_reads
            .mix ( ch_ont_reads ),
        ch_ref
    )

    EXTRACT_DESIRED_REGIONS (
        MAP_TO_REF.out,
        ch_desired_regions
    )

    // EXTRACT_DESIRED_REGIONS emits ( fastq, basename, platform, file_label ),
    // so index 2 is the platform. Chunks are regrouped per sample, platform
    // and region before merging.
    MERGE_PACBIO_FASTQS (
        EXTRACT_DESIRED_REGIONS.out
            .filter { it[2] == "pacbio" }
            .groupTuple( by: [ 1, 2, 3 ] )
    )

    MERGE_ONT_FASTQS (
        EXTRACT_DESIRED_REGIONS.out
            .filter { it[2] == "ont" }
            .groupTuple( by: [ 1, 2, 3 ] )
    )

    // Pair the PacBio and ONT merges of the same region explicitly. Feeding
    // the two process outputs straight into RUN_HIFIASM would pair them by
    // arrival order, which is not guaranteed to match across regions.
    // The key is the region label only, because the basenames come from two
    // different input files and are not expected to be equal.
    ch_assembly_inputs = MERGE_PACBIO_FASTQS.out
        .map { merged -> tuple( merged[3], merged ) }
        .join ( MERGE_ONT_FASTQS.out.map { merged -> tuple( merged[3], merged ) } )
        .multiMap { paired ->
            pacbio: paired[1]
            ont: paired[2]
        }

    RUN_HIFIASM (
        ch_assembly_inputs.pacbio,
        ch_assembly_inputs.ont
    )

    CONVERT_CONFIGS_TO_FASTA (
        RUN_HIFIASM.out
    )

}
75+
// --------------------------------------------------------------- //
76+
77+
78+
79+
// DERIVATIVE PARAMETER SPECIFICATION
80+
// --------------------------------------------------------------- //
81+
// Additional parameters that are derived from parameters set in nextflow.config
82+
83+
// Output sub-directories, both rooted at params.results.
params.extracted = params.results + "/01_extracted_regions"
params.assembly = params.results + "/02_hifiasm_assembly"
85+
86+
// --------------------------------------------------------------- //
87+
88+
89+
90+
91+
// PROCESS SPECIFICATION
92+
// --------------------------------------------------------------- //
93+
94+
process MAP_TO_REF {

    /*
    Align one FASTQ of reads to the reference with minimap2 and write a
    sorted BAM named <basename>_<platform>.bam.

    Inputs:
      - tuple of the reads file, a basename used for naming, and the
        sequencing platform label
      - the reference FASTA; declared with `each`, so every reads tuple is
        run against every reference provided
    Output:
      - the sorted BAM
    */

    tag "${basename}, ${platform}"
    label "map_and_extract"

    cpus 8

    input:
    tuple path(fastq), val(basename), val(platform)
    each path(ref_fasta)

    output:
    path "*.bam"

    script:
    // "pacbio" selects the HiFi preset; any other platform value falls back
    // to the Nanopore preset.
    minimap2_preset = platform == "pacbio" ? "map-hifi" : "map-ont"
    // Threads follow the cpus directive instead of a hard-coded count.
    // minimap2 -a writes SAM with a header, so samtools view needs no
    // separate reference listing to build the BAM.
    """
    minimap2 -t ${task.cpus} -L --eqx -ax ${minimap2_preset} \
    ${ref_fasta} \
    ${fastq} \
    | samtools view -b - \
    | samtools sort - -o ${basename}_${platform}.bam
    """

}
121+
122+
process EXTRACT_DESIRED_REGIONS {

    /*
    Pull the reads selected by one samtools expression out of one BAM and
    write them as a gzipped FASTQ.

    Inputs:
      - a BAM whose file name has the form <name>_<platform>.bam; declared
        with `each`, so every BAM is combined with every region tuple
      - tuple of the samtools expression, a label used in file names, and a
        free-text description (not used by the script)
    Output:
      - tuple of the extracted FASTQ, the sample basename, the platform and
        the region label
    */

    // The tag uses declared inputs only, so it does not depend on variables
    // that are first assigned inside the script block.
    tag "${bam.baseName}, ${file_label}"
    label "map_and_extract"

    input:
    each path(bam)
    tuple val(expression), val(file_label), val(description)

    output:
    tuple path("${bam.baseName}_${file_label}.fastq.gz"), val(basename), val(platform), val(file_label)

    script:
    // Recover platform and sample name from the BAM file name. The platform
    // is whatever follows the last underscore; the sample name is the part
    // before it, cut at the first dot so that any dotted suffix on the name
    // does not end up in the grouping key.
    bam_stem = bam.getBaseName()
    platform_sep = bam_stem.lastIndexOf("_")
    assert platform_sep > 0 : "Cannot parse '<name>_<platform>' from BAM file name: ${bam}"
    platform = bam_stem.substring(platform_sep + 1)
    basename = bam_stem.substring(0, platform_sep).tokenize(".")[0]
    // The output keeps the full BAM stem, so files produced from different
    // BAMs of the same sample have distinct names when staged together later.
    """
    samtools index ${bam}
    samtools view -b ${bam} ${expression} \
    | samtools fastq - \
    | reformat.sh qin=33 int=f in=stdin.fq \
    out=${bam_stem}_${file_label}.fastq.gz
    """

}
148+
149+
process MERGE_PACBIO_FASTQS {

    /*
    Concatenate all FASTQ files staged under to_merge/ into a single gzipped
    FASTQ named <basename>_<platform>_<file_label>.fastq.gz and publish it to
    params.extracted.

    Input:
      - tuple of the FASTQ files to merge plus the basename, platform and
        region label shared by all of them
    Output:
      - tuple of the merged FASTQ and the same three labels
    */

    tag "${basename}, ${platform}, ${file_label}"
    publishDir params.extracted, mode: 'copy', overwrite: true

    cpus 10

    input:
    // Files keep their source names inside to_merge/, so the incoming files
    // must have distinct names.
    tuple path("to_merge/*"), val(basename), val(platform), val(file_label)

    output:
    tuple path("${basename}_${platform}_${file_label}.fastq.gz"), val(basename), val(platform), val(file_label)

    script:
    // Every option line ends with a continuation so that the directory
    // argument and the gzip pipe stay part of the seqkit command.
    """
    seqkit scat \
    --threads ${task.cpus} \
    --find-only \
    --out-format fastq \
    to_merge/ | gzip -c > ${basename}_${platform}_${file_label}.fastq.gz
    """

}
174+
175+
process MERGE_ONT_FASTQS {

    /*
    Concatenate all FASTQ files staged under to_merge/ into a single gzipped
    FASTQ named <basename>_<platform>_<file_label>.fastq.gz and publish it to
    params.extracted.

    Input:
      - tuple of the FASTQ files to merge plus the basename, platform and
        region label shared by all of them
    Output:
      - tuple of the merged FASTQ and the same three labels
    */

    tag "${basename}, ${platform}, ${file_label}"
    publishDir params.extracted, mode: 'copy', overwrite: true

    cpus 10

    input:
    // Files keep their source names inside to_merge/, so the incoming files
    // must have distinct names.
    tuple path("to_merge/*"), val(basename), val(platform), val(file_label)

    output:
    tuple path("${basename}_${platform}_${file_label}.fastq.gz"), val(basename), val(platform), val(file_label)

    script:
    // Every option line ends with a continuation so that the directory
    // argument and the gzip pipe stay part of the seqkit command.
    """
    seqkit scat \
    --threads ${task.cpus} \
    --find-only \
    --out-format fastq \
    to_merge/ | gzip -c > ${basename}_${platform}_${file_label}.fastq.gz
    """

}
200+
201+
process RUN_HIFIASM {

    /*
    Assemble the PacBio reads with hifiasm, passing the Nanopore reads through
    --ul, and publish everything hifiasm writes to
    <params.assembly>/<basename>_<file_label>.

    Inputs:
      - tuple of the PacBio FASTQ with its basename, platform and region label
      - tuple of the Nanopore FASTQ with its own three labels
    Output:
      - tuple of all files produced in the task directory plus the labels of
        the PacBio input

    NOTE(review): the two inputs are consumed pairwise, so the caller is
    responsible for emitting matching PacBio and Nanopore tuples together.
    */

    tag "${basename}, ${file_label}"
    publishDir "${params.assembly}/${basename}_${file_label}", mode: 'copy', overwrite: true

    cpus 8

    input:
    tuple path(pb_fastq), val(basename), val(platform), val(file_label)
    // The Nanopore labels get their own names so they do not redeclare the
    // variables bound by the PacBio tuple; only the file itself is used.
    tuple path(ont_fastq), val(ont_basename), val(ont_platform), val(ont_file_label)

    output:
    tuple path("*"), val(basename), val(platform), val(file_label)

    script:
    // Threads follow the cpus directive instead of a hard-coded count.
    """
    hifiasm -o ${basename}_${file_label} -t ${task.cpus} --ul ${ont_fastq} ${pb_fastq}
    """

}
223+
224+
process CONVERT_CONFIGS_TO_FASTA {

    /*
    Convert the primary-contig GFA found under hifiasm_files/ into a FASTA
    named <basename>_<file_label>.p_contigs.fasta and publish it to
    params.assembly.

    Input:
      - tuple of the assembly files (staged under hifiasm_files/) plus the
        basename, platform and region label
    Output:
      - the contigs FASTA
    */

    tag "${basename}, ${platform}, ${file_label}"
    label "map_and_extract"
    publishDir params.assembly, mode: 'copy', overwrite: true

    cpus 8

    input:
    tuple path("hifiasm_files/*"), val(basename), val(platform), val(file_label)

    output:
    path "${basename}_${file_label}.p_contigs.fasta"

    shell:
    // In a shell block Nextflow variables are written !{...}; a plain $ is
    // left to bash/awk. The newline is written \\n so awk receives the
    // two-character escape instead of a raw line break inside its string.
    // Each GFA "S" line becomes a ">name" header followed by the sequence,
    // which fold then wraps.
    '''
    awk '/^S/{print ">"$2"\\n"$3}' hifiasm_files/!{basename}_!{file_label}.bp.p_ctg.gfa \
    | fold > !{basename}_!{file_label}.p_contigs.fasta
    '''

}
247+
248+
// --------------------------------------------------------------- //

0 commit comments

Comments
 (0)