Skip to content

Commit 31ef170

Browse files
committed
Merge branch 'dev'
2 parents dd77fed + f48d876 commit 31ef170

12 files changed

+111
-140
lines changed

bin/multisample_plot.py

+16-55
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/env python3
1+
#! /usr/bin/env python3
22
# /// script
33
# requires-python = ">=3.11"
44
# dependencies = [
@@ -36,8 +36,6 @@
3636
from __future__ import annotations
3737

3838
import argparse
39-
import json
40-
import os
4139
from math import log10
4240
from pathlib import Path
4341

@@ -49,6 +47,8 @@
4947
geom_line,
5048
ggplot,
5149
ggsave,
50+
guide_legend,
51+
guides,
5252
labs,
5353
theme_minimal,
5454
)
@@ -81,13 +81,6 @@ def parse_command_line_args() -> argparse.Namespace:
8181
required=True,
8282
help="Directory to scan for BED files.",
8383
)
84-
parser.add_argument(
85-
"--sample_lookup",
86-
"-s",
87-
type=Path,
88-
required=True,
89-
help="JSON-formatted sample lookup, where the keys are the barcodes, and the values are the sample ID to be associated with each barcode.",
90-
)
9184
parser.add_argument(
9285
"--min_coverage",
9386
"-m",
@@ -104,37 +97,7 @@ def parse_command_line_args() -> argparse.Namespace:
10497
return parser.parse_args()
10598

10699

107-
def read_sample_lookup(file_path: str) -> dict[str, str]:
108-
"""
109-
Reads a JSON file and returns its content.
110-
111-
This function opens the specified JSON file, loads its content into a Python object,
112-
prints the content to the console, and then returns the loaded data.
113-
114-
Parameters:
115-
file_path (str): The path to the JSON file that contains the data to be read.
116-
117-
Returns:
118-
data (dict or list): The content of the JSON file as a Python dictionary or list,
119-
depending on the JSON structure.
120-
121-
Raises:
122-
FileNotFoundError: If the specified file does not exist.
123-
json.JSONDecodeError: If the file is not a valid JSON format or is empty.
124-
125-
Example:
126-
>>> data = read_sample_lookup('path/to/data.json')
127-
>>> print(data)
128-
[ "item1", "item2", "item3" ]
129-
"""
130-
assert Path(
131-
file_path,
132-
).is_file(), f"The provided file path {file_path} does not exist."
133-
with Path(file_path).open("r", encoding="utf8") as file:
134-
return json.load(file)
135-
136-
137-
def accumulate_cov_dfs(directory: str, sample_lookup: dict[str, str]) -> pl.DataFrame:
100+
def accumulate_cov_dfs(directory: str, sample_list: list[str]) -> pl.DataFrame:
138101
"""
139102
Accumulate and concatenate multiple CSV files from a specified directory into a single Polars DataFrame.
140103
@@ -179,28 +142,21 @@ def accumulate_cov_dfs(directory: str, sample_lookup: dict[str, str]) -> pl.Data
179142
directory,
180143
).is_dir(), f"The provided input directory {directory} does not exist."
181144

182-
bed_list = []
183-
bc_list = []
184-
for filename in os.listdir(directory):
185-
f = Path(directory) / Path(filename)
186-
if not Path(f).is_file() and not filename.endswith(".bed"):
187-
continue
188-
barcode = filename.split(".")[0]
189-
bed_list.append(f)
190-
bc_list.append(barcode)
145+
sample_lookup = {
146+
sample_id: Path(directory) / Path(f"{sample_id}.per-base.bed")
147+
for sample_id in sample_list
148+
}
191149

192150
df_list = []
193-
for bed_file, barcode in zip(bed_list, bc_list):
194-
if barcode not in sample_lookup:
195-
continue
151+
for sample_id, bed_file in sample_lookup.items():
196152
bc_df = (
197153
pl.read_csv(
198154
bed_file,
199155
separator="\t",
200156
has_header=False,
201157
new_columns=["chromosome", "start", "stop", "coverage"],
202158
)
203-
.with_columns(sample=pl.lit(sample_lookup[barcode]))
159+
.with_columns(sample=pl.lit(sample_id))
204160
.with_columns(
205161
pl.int_ranges(start=pl.col("start"), end=pl.col("stop")).alias(
206162
"position",
@@ -267,6 +223,7 @@ def plot_log_coverages(
267223
)
268224
+ facet_wrap("~chromosome", scales="free_x")
269225
+ theme_minimal()
226+
+ guides(color=guide_legend(ncol=3))
270227
)
271228

272229

@@ -326,6 +283,7 @@ def plot_coverages(all_barcodes: pl.DataFrame, min_desired_depth: int = 20) -> g
326283
)
327284
+ facet_wrap("~chromosome", scales="free_x")
328285
+ theme_minimal()
286+
+ guides(color=guide_legend(ncol=3))
329287
)
330288

331289

@@ -359,7 +317,10 @@ def main() -> None:
359317
"""
360318
args = parse_command_line_args()
361319
min_desired_depth = args.min_coverage
362-
sample_list = read_sample_lookup(args.sample_lookup)
320+
sample_list = [
321+
path.name.replace(".per-base.bed", "")
322+
for path in Path(args.input_dir).glob("*.per-base.bed")
323+
]
363324
sample_dataframe = accumulate_cov_dfs(args.input_dir, sample_list)
364325

365326
if args.log:

bin/resplice_primers.py

+23-30
Original file line numberDiff line numberDiff line change
@@ -458,13 +458,11 @@ def resolve_primer_names(
458458
# current iteration is handling, and which old primer names should be in which
459459
# order for a join downstream.
460460
new_primer_pairs: list[tuple[str, str]] = []
461-
handled_pairs: list[list[int]] = []
462461
old_primer_pairs: list[tuple[str, str]] = []
463-
combo_ticker = 0
464462

465463
# loop through both primers, where primer1 is from the "deficit primer" list, and
466464
# primer2 is from the "excess primer" list.
467-
for old_fwd_primer, old_rev_primer in all_possible_pairs:
465+
for i, (old_fwd_primer, old_rev_primer) in enumerate(all_possible_pairs):
468466
# pull of the last element, delimited by hyphen, on both primer names
469467
fwd_final_element = old_fwd_primer.split(idx_delim)[idx_position]
470468
rev_final_element = old_rev_primer.split(idx_delim)[idx_position]
@@ -482,30 +480,18 @@ def resolve_primer_names(
482480
)
483481
sys.exit(1)
484482

485-
# figure out which combination of primers is being handled and check
486-
# whether it has already been handled
487-
primer1_index = int(fwd_final_element)
488-
primer2_index = int(rev_final_element)
489-
current_pair = sorted((primer1_index, primer2_index))
490-
if current_pair in handled_pairs:
491-
continue
492-
493-
# now that we know we're working with a previously unhandled pairing, increment
494-
# the combo ticker by one
495-
combo_ticker += 1
496-
497483
# use f-strings to construct new names that make the combinations explicit
498-
new_fwd_primer = f"{old_fwd_primer}_splice{combo_ticker}"
499-
new_rev_primer = f"{old_rev_primer}_splice{combo_ticker}"
484+
new_fwd_primer = (
485+
old_fwd_primer.replace(f"-{fwd_final_element}", "") + f"_splice{i + 1}"
486+
)
487+
new_rev_primer = (
488+
old_rev_primer.replace(f"-{rev_final_element}", "") + f"_splice{i + 1}"
489+
)
500490

501491
# continue accumulating old and new primer pair lists
502492
old_primer_pairs.append((old_fwd_primer, old_rev_primer))
503493
new_primer_pairs.append((new_fwd_primer, new_rev_primer))
504494

505-
# now that we know nothing has gone awry, add this pair to the accumulating list
506-
# of handled primer pairs
507-
handled_pairs.append(current_pair)
508-
509495
# flatten the tuples at each position of the pair lists with two comprehensions
510496
# to make it explicit to the reader that forward primers come before reverse primers
511497
# in the flattened list. These comprehensions handle the old primer names.
@@ -565,12 +551,12 @@ def resplice_primers(
565551
logger.debug(
566552
f"Pair of primers within the amplicon {amplicon} detected: {pair_labels}. No resplicing will be needed here, though double check that the necessary forward and reverse suffixes, {fwd_suffix} and {rev_suffix}, are present.",
567553
)
568-
assert any(
569-
fwd_suffix in primer for primer in pair_labels
570-
), f"The forward suffix {fwd_suffix} is missing in the provided primer pairs: {pair_labels}. Aborting."
571-
assert any(
572-
rev_suffix in primer for primer in pair_labels
573-
), f"The reverse suffix {rev_suffix} is missing in the provided primer pairs: {pair_labels}. Aborting."
554+
assert any(fwd_suffix in primer for primer in pair_labels), (
555+
f"The forward suffix {fwd_suffix} is missing in the provided primer pairs: {pair_labels}. Aborting."
556+
)
557+
assert any(rev_suffix in primer for primer in pair_labels), (
558+
f"The reverse suffix {rev_suffix} is missing in the provided primer pairs: {pair_labels}. Aborting."
559+
)
574560

575561
# if so, all is well. Append the primers into the growing list of correct
576562
# dataframes and move onto the next amplicon
@@ -607,8 +593,10 @@ def resplice_primers(
607593
# current design this should be impossible
608594
assert len(old_primer_names) == len(
609595
new_primer_names,
610-
), f"Insufficient number of replacement names ({new_primer_names}) generated \
596+
), (
597+
f"Insufficient number of replacement names ({new_primer_names}) generated \
611598
for partition for amplicon {amplicon}: {primer_df}"
599+
)
612600

613601
# run a join on the old primer names to bring in the new primer names in their
614602
# proper locations
@@ -669,8 +657,13 @@ def finalize_primer_pairings(
669657
for primer in df.select("NAME").to_series().to_list()
670658
if rev_suffix in primer
671659
]
672-
if len(fwd_keepers) > 0 and len(rev_keepers) > 0:
673-
final_frames.append(df)
660+
if len(fwd_keepers) == 0 or len(rev_keepers) == 0:
661+
logger.warning(
662+
"Incorrect splicing occurred for the following sets of primers. The amplicon they are derived from will be skipped:\n{fwd_keepers}\n{rev_keepers}",
663+
)
664+
continue
665+
666+
final_frames.append(df)
674667

675668
return pl.concat(final_frames)
676669

bin/split_primer_combos.py

+3
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ def main() -> None:
104104

105105
for df in bed_dfs:
106106
splicing = df.select("NAME").unique().item()
107+
assert (
108+
len(df) == 2 # noqa: PLR2004
109+
), f"Problematic splicing occurred with {splicing}"
107110
df.write_csv(
108111
file=f"{splicing}.bed",
109112
separator="\t",

main.nf

-6
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,6 @@ workflow {
6767
Channel.fromPath( params.snpEff_config ) :
6868
Channel.empty()
6969

70-
ch_sample_lookup = params.sample_lookup ?
71-
Channel.fromPath( params.sample_lookup ) :
72-
Channel.empty()
73-
7470
// decide whether to run the ont or the illumina workflow
7571
if ( params.platform == "ont" ) {
7672

@@ -79,7 +75,6 @@ workflow {
7975
ch_refseq,
8076
ch_ref_gbk,
8177
ch_snpeff_config,
82-
ch_sample_lookup
8378
)
8479

8580
} else if ( params.platform == "illumina" ) {
@@ -89,7 +84,6 @@ workflow {
8984
ch_refseq,
9085
ch_ref_gbk,
9186
ch_snpeff_config,
92-
ch_sample_lookup
9387
)
9488

9589
} else {

0 commit comments

Comments
 (0)