Skip to content

Commit e078b1a

Browse files
sschmeierjohanneskoester
authored andcommitted
QC (#14)
* Added qc ruleset. Do QC based on rseqc and multiqc. * Changed qc.smk to make rules not dependent on touched files but real outputs; improved readability. * Working but still with custom script for convreting gtf to bed12 * Fixed multiqc rule to include rseqc_junction_annotation logs. * Updated qc.smk to make it standardized.; removed multiqc env; removed scritps/gtf2bed.pl; added scripts/gtf2bed.py for converting gtf to bed12. * Reworked gtf2bed.py tp make it snakemake centric. * Update rseqc.yaml * Update gffutils.yaml * Update rseqc.yaml * Use output file names in script. * Use context for file obj.
1 parent c0a1740 commit e078b1a

File tree

6 files changed

+195
-1
lines changed

6 files changed

+195
-1
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ This workflow performs a differential expression analysis with STAR and Deseq2.
88
## Authors
99

1010
* Johannes Köster (@johanneskoester), https://koesterlab.github.io
11+
* Sebastian Schmeier (@sschmeier), https://sschmeier.com
1112

1213
## Usage
1314

Snakefile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ rule all:
2424
expand(["results/diffexp/{contrast}.diffexp.tsv",
2525
"results/diffexp/{contrast}.ma-plot.svg"],
2626
contrast=config["diffexp"]["contrasts"]),
27-
"results/pca.svg"
27+
"results/pca.svg",
28+
"qc/multiqc_report.html"
2829

2930

3031
##### setup singularity #####
@@ -45,3 +46,4 @@ include: "rules/common.smk"
4546
include: "rules/trim.smk"
4647
include: "rules/align.smk"
4748
include: "rules/diffexp.smk"
49+
include: "rules/qc.smk"

envs/gffutils.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- bioconda::gffutils=0.9

envs/rseqc.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
channels:
2+
- conda-forge
3+
- bioconda
4+
dependencies:
5+
- bioconda::rseqc=2.6.4

rules/qc.smk

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
## RSEQC
2+
3+
rule rseqc_gtf2bed:
4+
input:
5+
config["ref"]["annotation"]
6+
output:
7+
bed="qc/rseqc/annotation.bed",
8+
db=temp("qc/rseqc/annotation.db")
9+
log:
10+
"logs/rseqc_gtf2bed.log"
11+
conda:
12+
"../envs/gffutils.yaml"
13+
script:
14+
"../scripts/gtf2bed.py"
15+
16+
17+
rule rseqc_junction_annotation:
18+
input:
19+
bam="star/{sample}-{unit}/Aligned.out.bam",
20+
bed="qc/rseqc/annotation.bed"
21+
output:
22+
"qc/rseqc/{sample}-{unit}.junctionanno.junction.bed"
23+
priority: 1
24+
log:
25+
"logs/rseqc/rseqc_junction_annotation/{sample}-{unit}.log"
26+
params:
27+
extra=r"-q 255", # STAR uses 255 as a score for unique mappers
28+
prefix="qc/rseqc/{sample}-{unit}.junctionanno"
29+
conda:
30+
"../envs/rseqc.yaml"
31+
shell:
32+
"junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
33+
"> {log[0]} 2>&1"
34+
35+
36+
rule rseqc_junction_saturation:
37+
input:
38+
bam="star/{sample}-{unit}/Aligned.out.bam",
39+
bed="qc/rseqc/annotation.bed"
40+
output:
41+
"qc/rseqc/{sample}-{unit}.junctionsat.junctionSaturation_plot.pdf"
42+
priority: 1
43+
log:
44+
"logs/rseqc/rseqc_junction_saturation/{sample}-{unit}.log"
45+
params:
46+
extra=r"-q 255",
47+
prefix="qc/rseqc/{sample}-{unit}.junctionsat"
48+
conda:
49+
"../envs/rseqc.yaml"
50+
shell:
51+
"junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
52+
"> {log} 2>&1"
53+
54+
55+
rule rseqc_stat:
56+
input:
57+
"star/{sample}-{unit}/Aligned.out.bam",
58+
output:
59+
"qc/rseqc/{sample}-{unit}.stats.txt"
60+
priority: 1
61+
log:
62+
"logs/rseqc/rseqc_stat/{sample}-{unit}.log"
63+
conda:
64+
"../envs/rseqc.yaml"
65+
shell:
66+
"bam_stat.py -i {input} > {output} 2> {log}"
67+
68+
69+
rule rseqc_infer:
70+
input:
71+
bam="star/{sample}-{unit}/Aligned.out.bam",
72+
bed="qc/rseqc/annotation.bed"
73+
output:
74+
"qc/rseqc/{sample}-{unit}.infer_experiment.txt"
75+
priority: 1
76+
log:
77+
"logs/rseqc/rseqc_infer/{sample}-{unit}.log"
78+
conda:
79+
"../envs/rseqc.yaml"
80+
shell:
81+
"infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
82+
83+
84+
rule rseqc_innerdis:
85+
input:
86+
bam="star/{sample}-{unit}/Aligned.out.bam",
87+
bed="qc/rseqc/annotation.bed"
88+
output:
89+
"qc/rseqc/{sample}-{unit}.inner_distance_freq.inner_distance.txt"
90+
priority: 1
91+
log:
92+
"logs/rseqc/rseqc_innerdis/{sample}-{unit}.log"
93+
params:
94+
prefix="qc/rseqc/{sample}-{unit}.inner_distance_freq"
95+
conda:
96+
"../envs/rseqc.yaml"
97+
shell:
98+
"inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"
99+
100+
101+
rule rseqc_readdis:
102+
input:
103+
bam="star/{sample}-{unit}/Aligned.out.bam",
104+
bed="qc/rseqc/annotation.bed"
105+
output:
106+
"qc/rseqc/{sample}-{unit}.readdistribution.txt"
107+
priority: 1
108+
log:
109+
"logs/rseqc/rseqc_readdis/{sample}-{unit}.log"
110+
conda:
111+
"../envs/rseqc.yaml"
112+
shell:
113+
"read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"
114+
115+
116+
rule rseqc_readdup:
117+
input:
118+
"star/{sample}-{unit}/Aligned.out.bam"
119+
output:
120+
"qc/rseqc/{sample}-{unit}.readdup.DupRate_plot.pdf"
121+
priority: 1
122+
log:
123+
"logs/rseqc/rseqc_readdup/{sample}-{unit}.log"
124+
params:
125+
prefix="qc/rseqc/{sample}-{unit}.readdup"
126+
conda:
127+
"../envs/rseqc.yaml"
128+
shell:
129+
"read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"
130+
131+
132+
rule rseqc_readgc:
133+
input:
134+
"star/{sample}-{unit}/Aligned.out.bam"
135+
output:
136+
"qc/rseqc/{sample}-{unit}.readgc.GC_plot.pdf"
137+
priority: 1
138+
log:
139+
"logs/rseqc/rseqc_readgc/{sample}-{unit}.log"
140+
params:
141+
prefix="qc/rseqc/{sample}-{unit}.readgc"
142+
conda:
143+
"../envs/rseqc.yaml"
144+
shell:
145+
"read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"
146+
147+
148+
rule multiqc:
149+
input:
150+
expand("star/{unit.sample}-{unit.unit}/Aligned.out.bam", unit=units.itertuples()),
151+
expand("qc/rseqc/{unit.sample}-{unit.unit}.junctionanno.junction.bed", unit=units.itertuples()),
152+
expand("qc/rseqc/{unit.sample}-{unit.unit}.junctionsat.junctionSaturation_plot.pdf", unit=units.itertuples()),
153+
expand("qc/rseqc/{unit.sample}-{unit.unit}.infer_experiment.txt", unit=units.itertuples()),
154+
expand("qc/rseqc/{unit.sample}-{unit.unit}.stats.txt", unit=units.itertuples()),
155+
expand("qc/rseqc/{unit.sample}-{unit.unit}.inner_distance_freq.inner_distance.txt", unit=units.itertuples()),
156+
expand("qc/rseqc/{unit.sample}-{unit.unit}.readdistribution.txt", unit=units.itertuples()),
157+
expand("qc/rseqc/{unit.sample}-{unit.unit}.readdup.DupRate_plot.pdf", unit=units.itertuples()),
158+
expand("qc/rseqc/{unit.sample}-{unit.unit}.readgc.GC_plot.pdf", unit=units.itertuples()),
159+
expand("logs/rseqc/rseqc_junction_annotation/{unit.sample}-{unit.unit}.log", unit=units.itertuples())
160+
output:
161+
"qc/multiqc_report.html"
162+
log:
163+
"logs/multiqc.log"
164+
wrapper:
165+
"0.31.1/bio/multiqc"

scripts/gtf2bed.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import gffutils
2+
3+
db = gffutils.create_db(snakemake.input[0],
4+
dbfn=snakemake.output.db,
5+
force=True,
6+
keep_order=True,
7+
merge_strategy='merge',
8+
sort_attribute_values=True,
9+
disable_infer_genes=True,
10+
disable_infer_transcripts=True)
11+
12+
with open(snakemake.output.bed, 'w') as outfileobj:
13+
for tx in db.features_of_type('transcript', order_by='start'):
14+
bed = [s.strip() for s in db.bed12(tx).split('\t')]
15+
bed[3] = tx.id
16+
outfileobj.write('{}\n'.format('\t'.join(bed)))

0 commit comments

Comments
 (0)