Skip to content

Commit b783452

Browse files
Merge pull request #18 from TheLostLambda/crc
feat(consolidation): cross-replicate consolidation
2 parents 0905f42 + cc25e17 commit b783452

40 files changed

+11042
-125
lines changed

Cargo.toml

+2-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ version = "0.1.0"
44
edition = "2024"
55

66
[workspace]
7-
members = [
7+
members = [ "crates/consolidation",
88
"crates/dimer-builder",
99
"crates/muropeptide",
1010
"crates/nom-miette",
@@ -24,11 +24,10 @@ dead_code = "allow"
2424
[dependencies]
2525
# miette = { version = "7.2.0", features = ["fancy"] }
2626
miette = { git = "https://github.com/TheLostLambda/miette", features = ["fancy"] }
27-
once_cell = "1.19.0"
2827
polychem = { path = "crates/polychem/" }
2928
muropeptide = { path = "crates/muropeptide/" }
3029
smithereens = { path = "crates/smithereens/" }
31-
rustyline = "14.0.0"
30+
rustyline = "15.0.0"
3231
rust_decimal = "1.36.0"
3332

3433
# FIXME: This is just a list of nice libraries, but should eventually be deleted!

crates/consolidation/Cargo.toml

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "consolidation"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies.polars]
7+
version = "0.46.0"
8+
default-features = false
9+
features = [
10+
"abs",
11+
"csv",
12+
"dtype-struct",
13+
"lazy",
14+
"regex",
15+
"round_series",
16+
"strings",
17+
]
18+
19+
[lints]
20+
workspace = true
21+
22+
[dev-dependencies]
23+
insta = "1.42.1"

crates/consolidation/src/lib.rs

+273
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
use std::io::Cursor;
2+
3+
use polars::prelude::*;
4+
5+
// Constants ===========================================================================================================
6+
7+
// NOTE: Z-value for 95% confidence interval
8+
const Z_SCORE_FOR_CI_95: f64 = 1.96;
9+
const OLIGO_STATE_REGEX: &str = r"\|(\d+)";
10+
11+
struct InputColumns;
12+
impl InputColumns {
13+
const ALL: [&str; 5] = [
14+
Self::STRUCTURE,
15+
Self::INTENSITY,
16+
Self::RT,
17+
Self::THEO,
18+
Self::PPM,
19+
];
20+
const STRUCTURE: &str = "Structure";
21+
const INTENSITY: &str = "Consolidated Intensity";
22+
const RT: &str = "Consolidated RT (min)";
23+
const THEO: &str = "Consolidated Theo (Da)";
24+
const PPM: &str = "Consolidated Delta (ppm)";
25+
}
26+
27+
struct TemporaryColumns;
28+
impl TemporaryColumns {
29+
const REPLICATE: &str = "Replicate";
30+
const AVG_INTENSITY: &str = "Avg Intensity";
31+
const CI_INTENSITY: &str = "95% CI Intensity";
32+
}
33+
34+
struct OutputColumns;
35+
impl OutputColumns {
36+
const STRUCTURE: &str = "Structure";
37+
const OLIGO_STATE: &str = "Oligo State";
38+
const AVG_ABUNDANCE: &str = "Average Abundance (%)";
39+
const CI_ABUNDANCE: &str = "95% CI Abundance (%)";
40+
const AVG_RT: &str = "Average RT (min)";
41+
const CI_RT: &str = "95% CI RT (min)";
42+
const THEO: &str = "Theo (Da)";
43+
const PPM: &str = "Absolute Delta (ppm)";
44+
const PRESENT_IN: &str = "Present In";
45+
const REPLICATE_INTENSITY: &str = "Intensity";
46+
const REPLICATE_ABUNDANCE: &str = "Abundance";
47+
}
48+
49+
// Public API ==========================================================================================================
50+
51+
pub type ReplicateNumber = u32;
52+
pub type Replicate = (ReplicateNumber, String);
53+
54+
pub fn consolidate_replicates(replicates: &[Replicate]) -> String {
55+
// SAFETY: Overall this unwrap should be okay, since I also control PGFinder's outputs
56+
load_replicates(replicates)
57+
.map(consolidate)
58+
.and_then(into_csv)
59+
.unwrap_or_default()
60+
}
61+
62+
// Private Types =======================================================================================================
63+
64+
struct ReplicateFrame(Vec<ReplicateNumber>, LazyFrame);
65+
66+
// Private Functions ===================================================================================================
67+
68+
// TODO: Use a fixed-point Decimal type instead of a float, so that we don't have nasty floating-point errors that show
69+
// up in all of the results...
70+
fn load_replicates(replicates: &[Replicate]) -> PolarsResult<ReplicateFrame> {
71+
let replicate_numbers: Vec<_> = replicates.iter().map(|&(n, _)| n).collect();
72+
let lazyframe = concat(
73+
replicates
74+
.iter()
75+
.map(|&(replicate, ref csv)| {
76+
Ok(CsvReader::new(Cursor::new(csv))
77+
.finish()?
78+
.lazy()
79+
.select(InputColumns::ALL.map(col))
80+
.drop_nulls(None)
81+
.with_column(lit(replicate).alias(TemporaryColumns::REPLICATE)))
82+
})
83+
.collect::<PolarsResult<Vec<_>>>()?,
84+
UnionArgs::default(),
85+
);
86+
Ok(ReplicateFrame(replicate_numbers, lazyframe?))
87+
}
88+
89+
// FIXME: Benchmark if taking in the replicate numbers is even worth it... I'm just trying to not break the chain of
90+
// lazy operations here, so I'm using `group_by` instead of just `pivot`...
91+
fn consolidate(ReplicateFrame(replicates, df): ReplicateFrame) -> LazyFrame {
92+
let individual_replicate_columns = df
93+
.clone()
94+
.group_by([InputColumns::STRUCTURE])
95+
.agg(wide_formatted_replicate_columns(&replicates))
96+
.with_columns(replicate_abundance_columns(&replicates));
97+
98+
df.group_by([InputColumns::STRUCTURE])
99+
.agg(consolidated_columns())
100+
.with_column(oligo_state_column(OLIGO_STATE_REGEX))
101+
.with_columns(average_abundance_columns())
102+
.join(
103+
individual_replicate_columns,
104+
[col(OutputColumns::STRUCTURE)],
105+
[col(InputColumns::STRUCTURE)],
106+
JoinArgs::default(),
107+
)
108+
.select(formatted_output_columns(&replicates))
109+
.sort(
110+
[
111+
OutputColumns::OLIGO_STATE,
112+
OutputColumns::AVG_ABUNDANCE,
113+
OutputColumns::THEO,
114+
],
115+
SortMultipleOptions::new().with_order_descending_multi([false, true, false]),
116+
)
117+
}
118+
119+
fn into_csv(df: LazyFrame) -> PolarsResult<String> {
120+
let mut result = Vec::new();
121+
CsvWriter::new(&mut result).finish(&mut df.collect()?)?;
122+
// SAFETY: The `CsvWriter` should always return valid UTF-8
123+
Ok(String::from_utf8(result).unwrap())
124+
}
125+
126+
// ---------------------------------------------------------------------------------------------------------------------
127+
128+
fn wide_formatted_replicate_columns(replicates: &[u32]) -> Vec<Expr> {
129+
[
130+
vec![len().alias(OutputColumns::PRESENT_IN)],
131+
replicates
132+
.iter()
133+
.map(|&n| {
134+
col(InputColumns::INTENSITY)
135+
.filter(col(TemporaryColumns::REPLICATE).eq(n))
136+
.first()
137+
.alias(replicate_column(n, OutputColumns::REPLICATE_INTENSITY))
138+
})
139+
.collect(),
140+
]
141+
.concat()
142+
}
143+
144+
fn replicate_abundance_columns(replicates: &[u32]) -> Vec<Expr> {
145+
replicates
146+
.iter()
147+
.map(|&n| {
148+
let replicate_intensity_column =
149+
replicate_column(n, OutputColumns::REPLICATE_INTENSITY);
150+
intensity_to_abundance(&replicate_intensity_column, &replicate_intensity_column)
151+
.alias(replicate_column(n, OutputColumns::REPLICATE_ABUNDANCE))
152+
})
153+
.collect()
154+
}
155+
156+
fn consolidated_columns() -> [Expr; 6] {
157+
let confidence_interval = |c| lit(Z_SCORE_FOR_CI_95) * col(c).std(1) / len().sqrt();
158+
[
159+
mean(InputColumns::INTENSITY).alias(TemporaryColumns::AVG_INTENSITY),
160+
confidence_interval(InputColumns::INTENSITY).alias(TemporaryColumns::CI_INTENSITY),
161+
mean(InputColumns::RT).alias(OutputColumns::AVG_RT),
162+
confidence_interval(InputColumns::RT).alias(OutputColumns::CI_RT),
163+
col(InputColumns::THEO).first().alias(OutputColumns::THEO),
164+
col(InputColumns::PPM)
165+
.abs()
166+
.mean()
167+
.alias(OutputColumns::PPM),
168+
]
169+
}
170+
171+
fn oligo_state_column(oligo_state_re: &str) -> Expr {
172+
col(OutputColumns::STRUCTURE)
173+
.str()
174+
.extract(lit(oligo_state_re), 1)
175+
.alias(OutputColumns::OLIGO_STATE)
176+
}
177+
178+
fn average_abundance_columns() -> [Expr; 2] {
179+
[
180+
intensity_to_abundance(
181+
TemporaryColumns::AVG_INTENSITY,
182+
TemporaryColumns::AVG_INTENSITY,
183+
)
184+
.alias(OutputColumns::AVG_ABUNDANCE),
185+
intensity_to_abundance(
186+
TemporaryColumns::CI_INTENSITY,
187+
TemporaryColumns::AVG_INTENSITY,
188+
)
189+
.alias(OutputColumns::CI_ABUNDANCE),
190+
]
191+
}
192+
193+
fn formatted_output_columns(replicates: &[u32]) -> Vec<Expr> {
194+
[
195+
vec![
196+
col(OutputColumns::STRUCTURE)
197+
.str()
198+
.replace_all(lit(OLIGO_STATE_REGEX), lit(""), false),
199+
col(OutputColumns::OLIGO_STATE),
200+
col(OutputColumns::AVG_ABUNDANCE).round(3),
201+
col(OutputColumns::CI_ABUNDANCE).round(3),
202+
col(OutputColumns::AVG_RT).round(2),
203+
col(OutputColumns::CI_RT).round(2),
204+
col(OutputColumns::THEO).round(4),
205+
col(OutputColumns::PPM).round(1),
206+
col(OutputColumns::PRESENT_IN),
207+
],
208+
replicates
209+
.iter()
210+
.flat_map(|&n| {
211+
[
212+
col(replicate_column(n, OutputColumns::REPLICATE_INTENSITY)),
213+
col(replicate_column(n, OutputColumns::REPLICATE_ABUNDANCE)).round(3),
214+
]
215+
})
216+
.collect(),
217+
]
218+
.concat()
219+
}
220+
221+
// ---------------------------------------------------------------------------------------------------------------------
222+
223+
fn replicate_column(replicate: u32, column: &str) -> String {
224+
format!("Replicate {replicate} ({column})")
225+
}
226+
227+
fn intensity_to_abundance(column: &str, total_column: &str) -> Expr {
228+
lit(100) * col(column) / sum(total_column)
229+
}
230+
231+
// Unit Tests ==========================================================================================================
232+
233+
#[cfg(test)]
234+
mod tests {
235+
use std::sync::LazyLock;
236+
237+
use insta::assert_snapshot;
238+
239+
use super::*;
240+
241+
static REPLICATES: LazyLock<[Replicate; 3]> = LazyLock::new(|| {
242+
[
243+
(1, include_str!("../tests/data/E. coli WT_01.csv")),
244+
(2, include_str!("../tests/data/E. coli WT_02.csv")),
245+
(3, include_str!("../tests/data/E. coli WT_03.csv")),
246+
]
247+
.map(|(n, c)| (n, c.to_owned()))
248+
});
249+
250+
#[test]
251+
fn test_load_replicates() {
252+
assert_snapshot!(
253+
load_replicates(REPLICATES.as_ref())
254+
.and_then(|ReplicateFrame(_, df)| into_csv(df))
255+
.unwrap()
256+
);
257+
}
258+
259+
#[test]
260+
fn test_consolidate() {
261+
assert_snapshot!(
262+
load_replicates(REPLICATES.as_ref())
263+
.map(consolidate)
264+
.and_then(into_csv)
265+
.unwrap()
266+
);
267+
}
268+
269+
#[test]
270+
fn test_consolidate_replicates() {
271+
assert_snapshot!(consolidate_replicates(REPLICATES.as_ref()));
272+
}
273+
}

0 commit comments

Comments
 (0)