New tests for samplesheet (#1856)

maxulysse · web-flow · commit 452bbc2bbc31 · 2025-04-15T08:58:10.000+02:00
## PR checklist

- [ ] This comment contains a description of changes (with reason).
- [ ] If you've fixed a bug or added code that should be tested, add tests!
- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/sarek/tree/master/.github/CONTRIBUTING.md)?
- [ ] If necessary, also make a PR on the nf-core/sarek _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository.
- [ ] Make sure your code lints (`nf-core pipelines lint`).
- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`).
- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`).
- [ ] Usage Documentation in `docs/usage.md` is updated.
- [ ] Output Documentation in `docs/output.md` is updated.
- [ ] `CHANGELOG.md` is updated.
- [ ] `README.md` is updated (including new tool citations and authors/contributors).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [1817](https://github.com/nf-core/sarek/pull/1817) - Added new contributor
 - [1841](https://github.com/nf-core/sarek/pull/1841) - Add pcr-indel-model parameter for GATK HaplotypeCaller
 - [1848](https://github.com/nf-core/sarek/pull/1848) - Add parameter for setting pixel distance for GATK MarkDuplicates
+- [1856](https://github.com/nf-core/sarek/pull/1856) - Added early failure when more than 1 normal sample per patient is provided for somatic variant calling
 
 ### Changed
 
diff --git a/subworkflows/local/samplesheet_to_channel/main.nf b/subworkflows/local/samplesheet_to_channel/main.nf
@@ -1,40 +1,60 @@
 workflow  SAMPLESHEET_TO_CHANNEL{
 
     take:
-    ch_from_samplesheet             //
-    aligner                         //
-    ascat_alleles                   //
-    ascat_loci                      //
-    ascat_loci_gc                   //
-    ascat_loci_rt                   //
-    bcftools_annotations            //
-    bcftools_annotations_tbi        //
-    bcftools_header_lines           //
-    build_only_index                //
-    dbsnp                           //
-    fasta                           //
-    germline_resource               //
-    intervals                       //
-    joint_germline                  //
-    joint_mutect2                   //
-    known_indels                    //
-    known_snps                      //
-    no_intervals                    //
-    pon                             //
-    sentieon_dnascope_emit_mode     //
-    sentieon_haplotyper_emit_mode   //
-    seq_center                      //
-    seq_platform                    //
-    skip_tools                      //
-    snpeff_cache                    //
-    snpeff_db                       //
-    step                            //
-    tools                           //
-    umi_read_structure              //
-    wes                             //
+    ch_from_samplesheet           // channel: [ meta, fastq_1, fastq_2, spring_1, spring_2, table, cram, crai, bam, bai, vcf, variantcaller ]
+    aligner                       // String: aligner
+    ascat_alleles                 // Path: ascat alleles
+    ascat_loci                    // Path: ascat loci
+    ascat_loci_gc                 // Path: ascat loci gc
+    ascat_loci_rt                 // Path: ascat loci rt
+    bcftools_annotations          // Path: bcftools annotations
+    bcftools_annotations_tbi      // Path: bcftools annotations tbi
+    bcftools_header_lines         // Path: bcftools header lines
+    build_only_index              // Boolean: build only index
+    dbsnp                         // Path: dbsnp
+    fasta                         // Path: fasta
+    germline_resource             // Path: germline resource
+    intervals                     // Path: intervals
+    joint_germline                // Boolean: joint germline
+    joint_mutect2                 // Boolean: joint mutect2
+    known_indels                  // Path: known indels
+    known_snps                    // Path: known snps
+    no_intervals                  // Boolean: no intervals
+    pon                           // Path: pon
+    sentieon_dnascope_emit_mode   // String: sentieon dnascope emit mode
+    sentieon_haplotyper_emit_mode // String: sentieon haplotyper emit mode
+    seq_center                    // String: seq center
+    seq_platform                  // String: seq platform
+    skip_tools                    // Array: skip tools
+    snpeff_cache                  // Path: snpeff cache
+    snpeff_db                     // String: snpeff db
+    step                          // String: step
+    tools                         // Array: tools
+    umi_read_structure            // String: umi read structure
+    wes                           // Boolean (presumed, TODO confirm): wes
 
     main:
     ch_from_samplesheet.dump(tag:"ch_from_samplesheet")
+
+    ch_from_samplesheet
+        .map { meta, _fastq_1, _fastq_2, _spring_1, _spring_2, _table, _cram, _crai, _bam, _bai, _vcf, _variantcaller ->
+            // Key each samplesheet row by patient and keep only the sample and status fields of the meta map
+            [meta.patient, meta.subMap('sample', 'status')]
+        }
+        .unique() // drop repeated patient/sample/status entries so each one is counted once
+        .groupTuple() // gather all sample/status entries of a patient into one list
+        .map { patient, samples ->
+            // Count this patient's entries with status 0 and with status 1
+            def status0_count = samples.count { it.status == 0 }
+            def status1_count = samples.count { it.status == 1 }
+
+            // Fail early when exactly one status-1 (tumor) sample comes with more than one status-0 (normal) sample
+            if (status1_count == 1 && status0_count > 1) { // NOTE(review): patients with 2+ tumor samples are not checked here - confirm this is intended
+                System.err.println("Patient [${patient}] has more than one sample [${status0_count}] with normal status [0] and one sample with tumor status [1].")
+                error("Execution halted due to sample status inconsistency.")
+            }
+        }
+
     input_sample = ch_from_samplesheet.map{ meta, fastq_1, fastq_2, spring_1, spring_2, table, cram, crai, bam, bai, vcf, variantcaller ->
         // generate patient_sample key to group lanes together
         [ meta.patient + meta.sample, [meta, fastq_1, fastq_2, spring_1, spring_2, table, cram, crai, bam, bai, vcf, variantcaller] ]
diff --git a/tests/csv/3.0/recalibrated_somatic_two_normal_one_sample.csv b/tests/csv/3.0/recalibrated_somatic_two_normal_one_sample.csv
@@ -0,0 +1,4 @@
+patient,sex,status,sample,cram,crai
+test,XX,0,sample1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai
+test,XX,0,sample1B,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai
+test,XX,1,sample2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai
diff --git a/tests/default.nf.test b/tests/default.nf.test
@@ -49,21 +49,4 @@ nextflow_pipeline {
             )
         }
     }
-
-    test("-profile test --input tests/csv/3.0/sample_with_space.csv") {
-
-        when {
-            params {
-                input = "${projectDir}/tests/csv/3.0/sample_with_space.csv"
-                outdir = "$outputDir"
-            }
-        }
-
-        then {
-            assertAll(
-                { assert workflow.failed},
-                { assert workflow.stderr.toString().contains("Sample ID must be provided, cannot contain spaces and must be a string value") }
-            )
-        }
-    }
 }
diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap
@@ -324,8 +324,8 @@
         ],
         "meta": {
             "nf-test": "0.9.2",
-            "nextflow": "24.10.4"
+            "nextflow": "25.02.1"
         },
-        "timestamp": "2025-03-10T11:13:17.000556982"
+        "timestamp": "2025-04-02T15:44:44.567674265"
     }
 }
diff --git a/tests/samplesheets.nf.test b/tests/samplesheets.nf.test
@@ -0,0 +1,47 @@
+nextflow_pipeline {
+
+    name "Test pipeline"
+    script "../main.nf"
+    tag "pipeline"
+    tag "pipeline_sarek"
+    tag "cpu"
+
+    test("-profile test --input tests/csv/3.0/sample_with_space.csv") {
+        when {
+            params {
+                input = "${projectDir}/tests/csv/3.0/sample_with_space.csv"
+                outdir = "$outputDir"
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.failed}, // the run is expected to fail for this samplesheet
+                { assert snapshot(
+                    workflow.stderr.toString().split(",")[0..1,3..5] // comma-split stderr, keep pieces 0-1 and 3-5; piece 2 is left out (presumably run-specific - TODO confirm)
+                ).match() } // compared against the stored snapshot for this test name
+            )
+        }
+    }
+
+    test("-profile test --step variant_calling --input tests/csv/3.0/recalibrated_somatic_two_normal_one_sample.csv") {
+
+        when {
+            params {
+                modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'
+                input = "${projectDir}/tests/csv/3.0/recalibrated_somatic_two_normal_one_sample.csv"
+                outdir = "$outputDir"
+                step = "variant_calling"
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.failed}, // the run is expected to fail for this samplesheet
+                { assert snapshot(
+                    workflow.stderr.toString().split(",")[0] // only the text before the first comma of stderr is snapshotted
+                ).match() } // compared against the stored snapshot for this test name
+            )
+        }
+    }
+}
diff --git a/tests/samplesheets.nf.test.snap b/tests/samplesheets.nf.test.snap
@@ -0,0 +1,28 @@
+{
+    "-profile test --step variant_calling --input tests/csv/3.0/recalibrated_somatic_two_normal_one_sample.csv": {
+        "content": [
+            "[Patient [test] has more than one sample [2] with normal status [0] and one sample with tumor status [1].]"
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-04-03T17:48:43.052129562"
+    },
+    "-profile test --input tests/csv/3.0/sample_with_space.csv": {
+        "content": [
+            [
+                "[\u001b[0;31mThe following invalid input values have been detected:",
+                " ",
+                " \t-> Entry 2: Error for field 'sample' (test 2): \"test 2\" does not match regular expression [^\\S+$] (Sample ID must be provided",
+                " cannot contain spaces and must be a string value)",
+                " \u001b[0m"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-04-03T12:12:16.909966356"
+    }
+}

Original file line number	Diff line number	Diff line change
`@@ -49,21 +49,4 @@ nextflow_pipeline {`
`49`	`49`	`)`
`50`	`50`	`}`
`51`	`51`	`}`
`52`		`-`
`53`		`- test("-profile test --input tests/csv/3.0/sample_with_space.csv") {`
`54`		`-`
`55`		`- when {`
`56`		`- params {`
`57`		`- input = "${projectDir}/tests/csv/3.0/sample_with_space.csv"`
`58`		`- outdir = "$outputDir"`
`59`		`- }`
`60`		`- }`
`61`		`-`
`62`		`- then {`
`63`		`- assertAll(`
`64`		`- { assert workflow.failed},`
`65`		`- { assert workflow.stderr.toString().contains("Sample ID must be provided, cannot contain spaces and must be a string value") }`
`66`		`- )`
`67`		`- }`
`68`		`- }`
`69`	`52`	`}`
Original file line number	Diff line number	Diff line change
`@@ -324,8 +324,8 @@`
`324`	`324`	`],`
`325`	`325`	`"meta": {`
`326`	`326`	`"nf-test": "0.9.2",`
`327`		`- "nextflow": "24.10.4"`
	`327`	`+ "nextflow": "25.02.1"`
`328`	`328`	`},`
`329`		`- "timestamp": "2025-03-10T11:13:17.000556982"`
	`329`	`+ "timestamp": "2025-04-02T15:44:44.567674265"`
`330`	`330`	`}`
`331`	`331`	`}`