Skip to content

Commit a10f41a

Browse files
authored
Merge pull request #1128 from nf-core/dev
Dev -> master for 3.13.2 release
2 parents b59e87a + b7480f7 commit a10f41a

File tree

10 files changed

+167
-88
lines changed

10 files changed

+167
-88
lines changed

CHANGELOG.md

+18-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,28 @@
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
44
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
55

6+
## [[3.13.2](https://github.com/nf-core/rnaseq/releases/tag/3.13.2)] - 2023-11-21
7+
8+
### Credits
9+
10+
Special thanks to the following for their contributions to the release:
11+
12+
- [Jonathan Manning](https://github.com/pinin4fjords)
13+
- [Regina Hertfelder Reynolds](https://github.com/RHReynolds)
14+
- [Matthias Zepper](https://github.com/MatthiasZepper)
15+
16+
### Enhancements & fixes
17+
18+
- [PR #1123](https://github.com/nf-core/rnaseq/pull/1123) - Overhaul tximport.r, output length tables
19+
- [PR #1124](https://github.com/nf-core/rnaseq/pull/1124) - Ensure pseudoaligner is set if pseudoalignment is not skipped
20+
- [PR #1126](https://github.com/nf-core/rnaseq/pull/1126) - Pipeline fails if transcript_fasta not provided and `skip_gtf_filter = true`.
21+
- [PR #1127](https://github.com/nf-core/rnaseq/pull/1127) - Enlarge sampling to determine the number of columns in `filter_gtf.py` script.
22+
623
## [[3.13.1](https://github.com/nf-core/rnaseq/releases/tag/3.13.1)] - 2023-11-17
724

825
### Enhancements and fixes
926

10-
- [[PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix
27+
- [PR #1121](https://github.com/nf-core/rnaseq/pull/1121) - Changes for 3.13.1 patch release incl. igenomes star fix
1128

1229
## [[3.13.0](https://github.com/nf-core/rnaseq/releases/tag/3.13.0)] - 2023-11-17
1330

bin/filter_gtf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,14 @@ def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
2323
def tab_delimited(file: str) -> float:
2424
"""Check if file is tab-delimited and return median number of tabs."""
2525
with open(file, "r") as f:
26-
data = f.read(1024)
26+
data = f.read(102400)
2727
return statistics.median(line.count("\t") for line in data.split("\n"))
2828

2929

3030
def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
3131
"""Filter GTF file based on FASTA sequence names."""
3232
if tab_delimited(gtf_in) != 8:
33-
raise ValueError("Invalid GTF file: Expected 8 tab-separated columns.")
33+
raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")
3434

3535
seq_names_in_genome = extract_fasta_seq_names(fasta)
3636
logger.info(f"Extracted chromosome sequence names from {fasta}")

bin/tximport.r

+118-69
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,143 @@
11
#!/usr/bin/env Rscript
22

3-
# Written by Lorena Pantano and released under the MIT license.
3+
# Script for importing and processing transcript-level quantifications.
4+
# Written by Lorena Pantano, later modified by Jonathan Manning, and released
5+
# under the MIT license.
46

7+
# Loading required libraries
58
library(SummarizedExperiment)
69
library(tximport)
710

8-
args = commandArgs(trailingOnly=TRUE)
11+
# Parsing command line arguments
12+
args <- commandArgs(trailingOnly=TRUE)
913
if (length(args) < 4) {
10-
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
14+
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>",
15+
call.=FALSE)
1116
}
1217

13-
coldata = args[1]
14-
path = args[2]
15-
sample_name = args[3]
16-
quant_type = args[4]
17-
tx2gene_path = args[5]
18-
19-
prefix = sample_name
20-
21-
info = file.info(tx2gene_path)
22-
if (info$size == 0) {
23-
tx2gene = NULL
24-
} else {
25-
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
26-
colnames(rowdata) = c("tx", "gene_id", "gene_name")
27-
tx2gene = rowdata[,1:2]
18+
# Assigning command line arguments to variables
19+
coldata_path <- args[1]
20+
path <- args[2]
21+
prefix <- args[3]
22+
quant_type <- args[4]
23+
tx2gene_path <- args[5]
24+
25+
## Functions
26+
27+
# Build a table from a SummarizedExperiment object
28+
build_table <- function(se.obj, slot) {
29+
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
2830
}
2931

30-
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
31-
fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
32-
names = basename(dirname(fns))
33-
names(fns) = names
34-
35-
if (file.exists(coldata)) {
36-
coldata = read.csv(coldata, sep="\t")
37-
coldata = coldata[match(names, coldata[,1]),]
38-
coldata = cbind(files = fns, coldata)
39-
} else {
40-
message("ColData not available: ", coldata)
41-
coldata = data.frame(files = fns, names = names)
32+
# Write a table to a file with given parameters
33+
write_se_table <- function(params) {
34+
file_name <- paste0(prefix, ".", params$suffix)
35+
write.table(build_table(params$obj, params$slot), file_name,
36+
sep="\t", quote=FALSE, row.names = FALSE)
4237
}
4338

44-
dropInfReps = quant_type == "kallisto"
39+
# Read transcript metadata from a given path
40+
read_transcript_info <- function(tinfo_path){
41+
info <- file.info(tinfo_path)
42+
if (info$size == 0) {
43+
stop("tx2gene file is empty")
44+
}
45+
46+
transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE,
47+
col.names = c("tx", "gene_id", "gene_name"))
4548

46-
txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
47-
rownames(coldata) = coldata[["names"]]
48-
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
49-
if (length(extra) > 0) {
50-
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra))
49+
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
50+
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra))
51+
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
52+
rownames(transcript_info) <- transcript_info[["tx"]]
53+
54+
list(transcript = transcript_info,
55+
gene = unique(transcript_info[,2:3]),
56+
tx2gene = transcript_info[,1:2])
5157
}
52-
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),]
53-
rownames(rowdata) = rowdata[["tx"]]
54-
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]),
55-
colData = DataFrame(coldata),
56-
rowData = rowdata)
57-
if (!is.null(tx2gene)) {
58-
gi = summarizeToGene(txi, tx2gene = tx2gene)
59-
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
60-
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
61-
growdata = unique(rowdata[,2:3])
62-
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
63-
rownames(growdata) = growdata[["tx"]]
64-
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]),
65-
colData = DataFrame(coldata),
66-
rowData = growdata)
67-
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]),
68-
colData = DataFrame(coldata),
69-
rowData = growdata)
70-
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]),
71-
colData = DataFrame(coldata),
72-
rowData = growdata)
58+
59+
# Read and process sample/column data from a given path
60+
read_coldata <- function(coldata_path){
61+
if (file.exists(coldata_path)) {
62+
coldata <- read.csv(coldata_path, sep="\t")
63+
coldata <- coldata[match(names, coldata[,1]),]
64+
coldata <- cbind(files = fns, coldata)
65+
} else {
66+
message("ColData not available: ", coldata_path)
67+
coldata <- data.frame(files = fns, names = names)
68+
}
69+
rownames(coldata) <- coldata[["names"]]
7370
}
7471

75-
build_table = function(se.obj, slot) {
76-
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
72+
# Create a SummarizedExperiment object with given data
73+
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) {
74+
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length),
75+
colData = col_data,
76+
rowData = row_data)
7777
}
7878

79-
if(exists("gse")){
80-
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
81-
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
82-
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
83-
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
84-
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
85-
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
79+
# Main script starts here
80+
81+
# Define pattern for file names based on quantification type
82+
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
83+
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T)
84+
names <- basename(dirname(fns))
85+
names(fns) <- names
86+
dropInfReps <- quant_type == "kallisto"
87+
88+
# Import transcript-level quantifications
89+
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
90+
91+
# Read transcript and sample data
92+
transcript_info <- read_transcript_info(tx2gene_path)
93+
coldata <- read_coldata(coldata_path)
94+
95+
# Create initial SummarizedExperiment object
96+
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]],
97+
DataFrame(coldata), transcript_info$transcript)
98+
99+
# Setting parameters for writing tables
100+
params <- list(
101+
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"),
102+
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"),
103+
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv")
104+
)
105+
106+
# Process gene-level data if tx2gene mapping is available
107+
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) {
108+
tx2gene <- transcript_info$tx2gene
109+
gi <- summarizeToGene(txi, tx2gene = tx2gene)
110+
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
111+
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
112+
113+
gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),]
114+
rownames(gene_info) <- gene_info[["tx"]]
115+
116+
col_data_frame <- DataFrame(coldata)
117+
118+
# Create gene-level SummarizedExperiment objects
119+
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]],
120+
col_data_frame, gene_info)
121+
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]],
122+
col_data_frame, gene_info)
123+
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]],
124+
col_data_frame, gene_info)
125+
126+
params <- c(params, list(
127+
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"),
128+
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"),
129+
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"),
130+
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"),
131+
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"),
132+
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"),
133+
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv")
134+
))
86135
}
87136

88-
write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
89-
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
137+
# Writing tables for each set of parameters
138+
done <- lapply(params, write_se_table)
90139

91-
# Print sessioninfo to standard out
140+
# Output session information and citations
92141
citation("tximeta")
93142
sessionInfo()
94143

conf/modules.config

+1-2
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,6 @@ if (!params.skip_multiqc) {
10871087

10881088
if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'salmon') {
10891089
process {
1090-
10911090
withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:SALMON_QUANT' {
10921091
ext.args = { params.extra_salmon_quant_args ?: '' }
10931092
publishDir = [
@@ -1112,7 +1111,7 @@ if (!params.skip_pseudo_alignment && params.pseudo_aligner == 'kallisto') {
11121111
}
11131112
}
11141113

1115-
if (!params.skip_pseudo_alignment) {
1114+
if (!params.skip_pseudo_alignment && params.pseudo_aligner) {
11161115
process {
11171116
withName: '.*:QUANTIFY_PSEUDO_ALIGNMENT:TX2GENE' {
11181117
publishDir = [

modules/local/tximport/main.nf

+2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ process TXIMPORT {
1616
path "*gene_counts.tsv" , emit: counts_gene
1717
path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled
1818
path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled
19+
path "*gene_lengths.tsv" , emit: lengths_gene
1920
path "*transcript_tpm.tsv" , emit: tpm_transcript
2021
path "*transcript_counts.tsv" , emit: counts_transcript
22+
path "*transcript_lengths.tsv" , emit: lengths_transcript
2123
path "versions.yml" , emit: versions
2224

2325
when:

nextflow.config

+1-1
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,7 @@ manifest {
317317
description = """RNA sequencing analysis pipeline for gene/isoform quantification and extensive quality control."""
318318
mainScript = 'main.nf'
319319
nextflowVersion = '!>=23.04.0'
320-
version = '3.13.1'
320+
version = '3.13.2'
321321
doi = 'https://doi.org/10.5281/zenodo.1400710'
322322
}
323323

subworkflows/local/prepare_genome/main.nf

+3-4
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ workflow PREPARE_GENOME {
4747
rsem_index // directory: /path/to/rsem/index/
4848
salmon_index // directory: /path/to/salmon/index/
4949
kallisto_index // directory: /path/to/kallisto/index/
50-
hisat2_index // directory: /path/to/hisat2/index/
50+
hisat2_index // directory: /path/to/hisat2/index/
5151
bbsplit_index // directory: /path/to/rsem/index/
5252
gencode // boolean: whether the genome is from GENCODE
5353
is_aws_igenome // boolean: whether the genome files are from AWS iGenomes
@@ -139,14 +139,13 @@ workflow PREPARE_GENOME {
139139
} else {
140140
ch_transcript_fasta = Channel.value(file(transcript_fasta))
141141
}
142-
if (gencode) {
142+
if (gencode) {
143143
PREPROCESS_TRANSCRIPTS_FASTA_GENCODE ( ch_transcript_fasta )
144144
ch_transcript_fasta = PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.fasta
145145
ch_versions = ch_versions.mix(PREPROCESS_TRANSCRIPTS_FASTA_GENCODE.out.versions)
146146
}
147147
} else {
148148
ch_transcript_fasta = MAKE_TRANSCRIPTS_FASTA ( ch_fasta, ch_gtf ).transcript_fasta
149-
ch_versions = ch_versions.mix(GTF_FILTER.out.versions)
150149
ch_versions = ch_versions.mix(MAKE_TRANSCRIPTS_FASTA.out.versions)
151150
}
152151

@@ -268,7 +267,7 @@ workflow PREPARE_GENOME {
268267
ch_versions = ch_versions.mix(SALMON_INDEX.out.versions)
269268
}
270269
}
271-
270+
272271
//
273272
// Uncompress Kallisto index or generate from scratch if required
274273
//

subworkflows/local/quantify_pseudo/main.nf

+8-6
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT {
7979
results = ch_pseudo_results // channel: [ val(meta), results_dir ]
8080
multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]
8181

82-
tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ]
83-
counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ]
84-
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
85-
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ]
86-
tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ]
87-
counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ]
82+
tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv
83+
counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv
84+
lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv
85+
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv
86+
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv
87+
tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv
88+
counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv
89+
lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv
8890

8991
merged_gene_rds = SE_GENE.out.rds // path: *.rds
9092
merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds

tower.yml

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ reports:
55
display: "All samples STAR Salmon DESeq2 QC PDF plots"
66
"**/star_salmon/salmon.merged.gene_counts.tsv":
77
display: "All samples STAR Salmon merged gene raw counts"
8+
"**/star_salmon/salmon.merged.gene_lengths.tsv":
9+
display: "All samples STAR Salmon mean transcript length per gene"
10+
"**/star_salmon/salmon.merged.transcript_lengths.tsv":
11+
display: "All samples STAR Salmon transcript lengths"
812
"**/star_salmon/salmon.merged.gene_tpm.tsv":
913
display: "All samples STAR Salmon merged gene TPM counts"
1014
"**/star_salmon/salmon.merged.transcript_counts.tsv":
@@ -15,6 +19,10 @@ reports:
1519
display: "All samples Salmon DESeq2 QC PDF plots"
1620
"**/salmon/salmon.merged.gene_counts.tsv":
1721
display: "All samples Salmon merged gene raw counts"
22+
"**/salmon/salmon.merged.gene_lengths.tsv":
23+
display: "All samples Salmon mean transcript length per gene"
24+
"**/salmon/salmon.merged.transcript_lengths.tsv":
25+
display: "All samples Salmon transcript lengths"
1826
"**/salmon/salmon.merged.gene_tpm.tsv":
1927
display: "All samples Salmon merged gene TPM counts"
2028
"**/salmon/salmon.merged.transcript_counts.tsv":
@@ -25,6 +33,10 @@ reports:
2533
display: "All samples Kallisto DESeq2 QC PDF plots"
2634
"**/kallisto/kallisto.merged.gene_counts.tsv":
2735
display: "All samples Kallisto merged gene raw counts"
36+
"**/kallisto/kallisto.merged.gene_lengths.tsv":
37+
display: "All samples Kallisto mean transcript length per gene"
38+
"**/kallisto/kallisto.merged.transcript_lengths.tsv":
39+
display: "All samples Kallisto transcript lengths"
2840
"**/kallisto/kallisto.merged.gene_tpm.tsv":
2941
display: "All samples Kallisto merged gene TPM counts"
3042
"**/kallisto/kallisto.merged.transcript_counts.tsv":

workflows/rnaseq.nf

+2-3
Original file line numberDiff line numberDiff line change
@@ -816,9 +816,8 @@ workflow RNASEQ {
816816
//
817817
ch_pseudo_multiqc = Channel.empty()
818818
ch_pseudoaligner_pca_multiqc = Channel.empty()
819-
ch_pseudoaligner_clustering_multiqc = Channel.empty()
820-
821-
if (!params.skip_pseudo_alignment) {
819+
ch_pseudoaligner_clustering_multiqc = Channel.empty()
820+
if (!params.skip_pseudo_alignment && params.pseudo_aligner) {
822821

823822
if (params.pseudo_aligner == 'salmon') {
824823
ch_pseudo_index = PREPARE_GENOME.out.salmon_index

0 commit comments

Comments
 (0)