Skip to content

Commit 0905f42

Browse files
Merge pull request #17 from TheLostLambda/convert-to-smiles
Add MVP SMILES generation to `muropeptide`
2 parents 2e56f98 + 76e6673 commit 0905f42

26 files changed

+3577
-259
lines changed

.github/workflows/ci.yml

+11-4
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ jobs:
1717
strategy:
1818
matrix:
1919
os: [ubuntu-latest, macos-latest, windows-latest]
20-
toolchain: [stable, beta, nightly]
20+
# toolchain: [stable, beta, nightly]
21+
toolchain: [nightly]
2122
steps:
2223
- uses: actions/checkout@v4
2324
- uses: Swatinem/rust-cache@v2
@@ -26,7 +27,10 @@ jobs:
2627
with:
2728
toolchain: ${{ matrix.toolchain }}
2829

29-
- run: just test
30+
- run: cargo test --workspace
31+
env:
32+
RUSTFLAGS: -Zmacro-backtrace
33+
# - run: just test
3034
- run: just bench
3135

3236
check_wasm_build:
@@ -36,7 +40,7 @@ jobs:
3640
- uses: actions/checkout@v4
3741
- uses: Swatinem/rust-cache@v2
3842
- uses: extractions/setup-just@v2
39-
- uses: dtolnay/rust-toolchain@stable
43+
- uses: dtolnay/rust-toolchain@beta
4044
with:
4145
target: wasm32-unknown-unknown
4246

@@ -49,7 +53,10 @@ jobs:
4953
- uses: actions/checkout@v4
5054
- uses: Swatinem/rust-cache@v2
5155
- uses: extractions/setup-just@v2
52-
- uses: dtolnay/rust-toolchain@stable
56+
- uses: dtolnay/rust-toolchain@beta
57+
# TODO: Remove this when moving back to `stable`
58+
with:
59+
components: rustfmt, clippy
5360

5461
- run: just lint
5562

crates/muropeptide/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ edition = "2024"
66
[dependencies]
77
itertools = "0.13.0"
88
# miette = "7.2.0"
9+
knuffel = { git = "https://github.com/TheLostLambda/knus.git", branch = "tll-fork" }
910
miette = { git = "https://github.com/TheLostLambda/miette" }
1011
nom = "7.1.3"
1112
nom-miette = { path = "../nom-miette" }
@@ -14,6 +15,8 @@ smithereens = { path = "../smithereens" }
1415
thiserror = "1.0.59"
1516
# FIXME: Move back to `dev-dependencies`
1617
once_cell = "1.19.0"
18+
ahash = "0.8.11"
19+
regex = "1.11.1"
1720

1821
[dev-dependencies]
1922
divan = "0.1.14"
@@ -22,6 +25,7 @@ insta = { version = "1.38.0", features = ["redactions", "ron"] }
2225
miette = { git = "https://github.com/TheLostLambda/miette", features = ["fancy"] }
2326
rust_decimal = "1.36.0"
2427
rust_decimal_macros = "1.34.2"
28+
serde = { version = "1.0.217", features = ["derive"] }
2529

2630
[[bench]]
2731
name = "api"

crates/muropeptide/data/polymer_database.kdl

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ modifications {
5050
Poly "Wall Polymer Linkage" {
5151
targeting "Hydroxyl" at="6-Position"
5252
lost "H"
53-
gained "PO3"
53+
gained "H2PO3"
5454
}
5555
DeAc "De-N-Acetylation" {
5656
targeting "Acetyl" at="Secondary Amide"
+243
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
residues {
2+
types {
3+
Monosaccharide {
4+
bond-sites "Gly" donor="(O)$" acceptor="^O()"
5+
6+
// FIXME: I should make this like `bond-sites`, where only a capture group is replaced — that way I can give less
7+
// verbose `with=` strings...
8+
modification "Ac" replace=r"\)CO\)\[C@H" with=")COC(=O)C)[C@H"
9+
modification "DeAc" replace=r"NC\(=O\)C" with="N"
10+
modification "Poly" replace=r"\)CO\)\[C@H" with=")COP(O)(O)=O)[C@H"
11+
modification "Red" {
12+
replace "1" with=""
13+
// FIXME: With only two non-H substituents, the carbon is no longer chiral!
14+
replace r"\[C@@H\]O$" with="CO"
15+
}
16+
}
17+
18+
AminoAcid {
19+
// Template SMILES — the `sidechain` is inserted into this template (removing the `<>`)
20+
isomer "L" "N[C@@H](<sidechain>)C(O)=O"
21+
isomer "D" "N[C@H](<sidechain>)C(O)=O"
22+
23+
// Bonding site regex — the match is replaced by a link to the acceptor residue
24+
// NOTE: I really wish I could write something like `N[C@@H](<sidechain>)C(=O)O` for my amino acids, and then have
25+
// this rule just be something clean like `(O)$`, but that would ultimately generate something like `C(=O)1`,
26+
// which seems reasonable, and seems to be accepted perfectly fine by most SMILES tools, but technically
27+
// OpenSMILES demands that those "ring closure" `rnum`s come *before* any branches, so it must be `C1(=O)` or
28+
// `C1=O` instead. Aggravating, dumb, stupid even, but I'll obey for the sake of the widest possible compatibility
29+
bond-sites "Pep" donor=r"(\(O\))=O$" acceptor="^N()"
30+
// FIXME: This remains super dumb and is the fault of known shortcomings in the polymer database KDL format...
31+
bond-sites "Link" donor=r"(\(O\))=O$" acceptor="^N()"
32+
// FIXME: This is a sign that things are starting to go wrong... I should be using the functional group
33+
// abstraction!
34+
bond-sites "Stem" acceptor="^N()"
35+
bond-sites "CToN" donor=r"(\(O\))=O$"
36+
bond-sites "NToC" acceptor="^N()"
37+
38+
// Implicit stereoisomer selection rules — isomers are picked in order of definition unless a rule like
39+
// `stem` or `lateral` matches, then that `use=` isomer is picked
40+
stem 4 5 use="D"
41+
}
42+
}
43+
44+
Monosaccharide "g" {
45+
// SMILES written starting at C4's OH, ending with C1's OH — this way, concatenating structures (removing one O)
46+
// creates a 1-4 glycosidic linkage
47+
isomer "β-D" "O[C@H]([C@H](O1)CO)[C@H](O)[C@@H](NC(=O)C)[C@@H]1O"
48+
}
49+
50+
Monosaccharide "m" {
51+
isomer "β-D" "O[C@H]([C@H](O1)CO)[C@H](O[C@H](C)C(O)=O)[C@@H](NC(=O)C)[C@@H]1O"
52+
53+
// The `O` is lost from the MurNAc's lactyl group to form the peptide bond — the captured regex group is the one lost
54+
// and replaced with a link during bonding
55+
bond-sites "Stem" donor=r"(\(O\))=O"
56+
57+
modification "Glyc" replace=r"=O\)C" with="=O)CO"
58+
// NOTE: Sometimes you'll need to perform several small modifications — we avoid trying to match and replace large
59+
// sections of the string so that this modification doesn't overwrite any others!
60+
modification "Anh" {
61+
replace "$" with="2"
62+
replace r"\)CO\)\[C@H" with=")C2)[C@H"
63+
}
64+
}
65+
66+
AminoAcid "A" {
67+
sidechain "C"
68+
}
69+
70+
AminoAcid "B" {
71+
sidechain "CCN"
72+
73+
// Type B2γ peptidoglycan from Fig 3 of https://doi.org/10.1099/00207713-48-2-403
74+
lateral use="D"
75+
76+
// FIXME: Probably better if we take functional groups into account in the future — this is leading to a lot of
77+
// repetition at the moment...
78+
// FIXME: Maybe these could be changed to `acceptor "CN()"` for all bonds, or if there are several groups that need to
79+
// be used for different bonds, then `acceptor "CN()" "CToN" "Link"` etc? Or really, I think this problem would be
80+
// solved if I kept the current format, but instead of requiring exactly one bond name, I allowed 0+, so these would
81+
// all be valid:
82+
// 1) `bond-sites acceptor="CN()"`
83+
// 2) `bond-sites "CToN" acceptor="CN()"`, and
84+
// 3) `bond-sites "CToN" "Link" acceptor="CN()"`
85+
// And so on...
86+
bond-sites "CToN" acceptor="CN()"
87+
bond-sites "Link" acceptor="CN()"
88+
}
89+
90+
AminoAcid "C" {
91+
sidechain "CS"
92+
}
93+
94+
AminoAcid "D" {
95+
sidechain "CC(O)=O"
96+
97+
// By Steph's request
98+
lateral use="D"
99+
100+
bond-sites "NToC" donor=r"(\(O\))=O\)"
101+
102+
modification "Am" replace=r"O\)=O\)" with="N)=O)"
103+
}
104+
105+
AminoAcid "E" {
106+
sidechain "CCC(O)=O"
107+
108+
// Same as a normal D isomer, but I've swapped the sidechain and `COOH` so bonds are made to the side-chain (which now
109+
// comes last in the SMILES) — because of this swap, `@` must become `@@` to keep the overall same shape
110+
isomer "D-iso" "N[C@@H](C(O)=O)CCC(O)=O"
111+
stem 2 use="D-iso"
112+
113+
bond-sites "NToC" donor=r"(\(O\))=O\)"
114+
115+
modification "Am" replace=r"O\)=O\)" with="N)=O)"
116+
}
117+
118+
AminoAcid "F" {
119+
sidechain "Cc1ccccc1"
120+
}
121+
122+
AminoAcid "G" {
123+
// Here, `null` is just a nameless "isomer" — it's always picked since the lack of `sidechain` means that the L and D
124+
// isomers aren't defined
125+
isomer null "NCC(O)=O"
126+
}
127+
128+
AminoAcid "H" {
129+
sidechain "Cc1nc[nH]c1"
130+
}
131+
132+
AminoAcid "I" {
133+
// Chirality requires the central carbon to have 4 different substituents, this is our first amino acid with another
134+
// chiral center in the sidechain (so far all other sidechain carbons have had 2+ hydrogens). Importantly, the
135+
// chirality of the L/D-isoleucine sidechains is *different*, so we need to manually give both isomers.
136+
isomer "L" "N[C@@H]([C@H](CC)C)C(O)=O"
137+
isomer "D" "N[C@H]([C@@H](CC)C)C(O)=O"
138+
}
139+
140+
AminoAcid "J" {
141+
// No `sidechain`, so no L and D forms...
142+
isomer "meso" "N[C@@H](CCC[C@@H](N)C(O)=O)C(O)=O"
143+
144+
bond-sites "NToC" donor=r"(\(O\))=O\)"
145+
bond-sites "CToN" acceptor=r"\(N()\)"
146+
bond-sites "Link" acceptor=r"\(N()\)"
147+
148+
modification "Am" replace=r"O\)=O\)" with="N)=O)"
149+
}
150+
151+
AminoAcid "K" {
152+
sidechain "CCCCN"
153+
154+
bond-sites "CToN" acceptor="CN()"
155+
bond-sites "Link" acceptor="CN()"
156+
}
157+
158+
AminoAcid "L" {
159+
sidechain "CC(C)(C)"
160+
}
161+
162+
AminoAcid "M" {
163+
sidechain "CCSC"
164+
}
165+
166+
AminoAcid "N" {
167+
sidechain "CC(=O)N"
168+
169+
// By Steph's request
170+
lateral use="D"
171+
172+
bond-sites "CToN" acceptor=r"\(=O\)N()"
173+
bond-sites "Link" acceptor=r"\(=O\)N()"
174+
}
175+
176+
AminoAcid "O" {
177+
sidechain "CCCN"
178+
179+
bond-sites "CToN" acceptor="CN()"
180+
bond-sites "Link" acceptor="CN()"
181+
}
182+
183+
AminoAcid "P" {
184+
// No `sidechain` since proline's sidechain loops back to the N-terminal — we'll need a different backbone
185+
isomer "L" "N1[C@@H](CCC1)C(O)=O"
186+
isomer "D" "N1[C@H](CCC1)C(O)=O"
187+
}
188+
189+
AminoAcid "Q" {
190+
sidechain "CCC(=O)N"
191+
192+
// Same as a normal D isomer, but I've swapped the sidechain and `COOH` so bonds are made to the side-chain (which now
193+
// comes last in the SMILES) — because of this swap, `@` must become `@@` to keep the overall same shape
194+
isomer "D-iso" "N[C@@H](C(=O)N)CCC(O)=O"
195+
stem 2 use="D-iso"
196+
197+
bond-sites "CToN" acceptor=r"\(=O\)N()"
198+
bond-sites "Link" acceptor=r"\(=O\)N()"
199+
}
200+
201+
AminoAcid "R" {
202+
sidechain "CCCNC(=N)N"
203+
}
204+
205+
AminoAcid "S" {
206+
sidechain "CO"
207+
}
208+
209+
AminoAcid "T" {
210+
// The stereochemistry here is evil — D-Threonine has a change in the chirality of its sidechain (when compared to
211+
// L-Threonine). Because the sidechain changes, we just need to give both isomers from scratch and forget about the
212+
// templates (so no `sidechain` here)!
213+
isomer "L" "N[C@@H]([C@@H](C)O)C(O)=O"
214+
isomer "D" "N[C@H]([C@H](C)O)C(O)=O"
215+
}
216+
217+
AminoAcid "U" {
218+
sidechain "CCO"
219+
}
220+
221+
AminoAcid "V" {
222+
sidechain "C(C)C"
223+
}
224+
225+
AminoAcid "W" {
226+
// Indole sidechain — the ring nitrogen carries a hydrogen, so it must be written `[nH]`; a bare aromatic `n` here
// leaves the fused ring system impossible to kekulize and strict SMILES parsers reject it
sidechain "Cc1c2c(cccc2)[nH]c1"
227+
}
228+
229+
AminoAcid "Y" {
230+
sidechain "Cc1ccc(O)cc1"
231+
}
232+
233+
AminoAcid "Z" {
234+
// Worst one yet... Connected through the γ-carbon like D-isoglutamic acid
235+
// (https://doi.org/10.1128/br.36.4.407-477.1972; Fig 5), but also like threonine where the L and D forms have
236+
// different sidechains (chirality-wise): https://doi.org/10.1002/jlac.199419940618
237+
isomer "L" "N[C@@H]([C@H](O)CC(=O)O)C(O)=O"
238+
isomer "D" "N[C@H]([C@@H](O)CC(=O)O)C(O)=O"
239+
isomer "D-iso" "N[C@@H](C(=O)O)[C@@H](O)CC(O)=O"
240+
stem 2 use="D-iso"
241+
}
242+
}
243+

crates/muropeptide/src/lib.rs

+21
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
//! Responsible for parsing strings into meaningful `Muropeptide` structures
22
mod parser;
3+
mod smiles_database;
34

45
use std::fmt::{self, Display, Formatter};
56

@@ -13,9 +14,19 @@ use polychem::{
1314
AtomicDatabase, AverageMass, BondId, Charged, GroupState, Massive, ModificationInfo,
1415
MonoisotopicMass, Polymer, PolymerDatabase, Polymerizer, ResidueId, errors::PolychemError,
1516
};
17+
use smiles_database::SmilesDatabase;
1618
use smithereens::Dissociable;
1719
use thiserror::Error;
1820

21+
// FIXME: Need to think about whether these should really live in another KDL config
22+
const PEPTIDE_BOND: &str = "Pep";
23+
const GLYCOSIDIC_BOND: &str = "Gly";
24+
const STEM_BOND: &str = "Stem";
25+
const NTOC_BOND: &str = "NToC";
26+
const CTON_BOND: &str = "CToN";
27+
const CROSSLINK_BOND: &str = "Link";
28+
const LAT_CROSSLINK_BOND: &str = "Lat-Link";
29+
1930
// FIXME: These need more thought / are a temporary hack!
2031
static ATOMIC_DB: Lazy<AtomicDatabase> = Lazy::new(AtomicDatabase::default);
2132
pub static POLYMER_DB: Lazy<PolymerDatabase> = Lazy::new(|| {
@@ -27,8 +38,18 @@ pub static POLYMER_DB: Lazy<PolymerDatabase> = Lazy::new(|| {
2738
.unwrap()
2839
});
2940

41+
// FIXME: This maybe shouldn't be here long term? Needs some thought...
3042
pub static POLYMERIZER: Lazy<Polymerizer> = Lazy::new(|| Polymerizer::new(&ATOMIC_DB, &POLYMER_DB));
3143

44+
// FIXME: This maybe shouldn't be here long term? Needs some thought...
45+
pub static SMILES_DB: Lazy<SmilesDatabase> = Lazy::new(|| {
46+
SmilesDatabase::new(
47+
"smiles_database.kdl",
48+
include_str!("../data/smiles_database.kdl"),
49+
)
50+
.unwrap()
51+
});
52+
3253
const AUTO_MODS: [&str; 1] = ["Red"];
3354

3455
#[derive(Debug)]

0 commit comments

Comments
 (0)