|
| 1 | +residues { |
| 2 | + types { |
| 3 | + Monosaccharide { |
| 4 | + bond-sites "Gly" donor="(O)$" acceptor="^O()" |
| 5 | + |
| 6 | + // FIXME: I should make this like `bond-sites`, where only a capture group is replaced — that way I can give less |
| 7 | + // verbose `with=` strings... |
| 8 | + modification "Ac" replace=r"\)CO\)\[C@H" with=")COC(=O)C)[C@H" |
| 9 | + modification "DeAc" replace=r"NC\(=O\)C" with="N" |
| 10 | + modification "Poly" replace=r"\)CO\)\[C@H" with=")COP(O)(O)=O)[C@H" |
| 11 | + modification "Red" { |
| 12 | + replace "1" with="" |
| 13 | + // FIXME: With only two non-H substituents, the carbon is no longer chiral! |
| 14 | + replace r"\[C@@H\]O$" with="CO" |
| 15 | + } |
| 16 | + } |
| 17 | + |
| 18 | + AminoAcid { |
| 19 | + // Template SMILES — the `sidechain` is inserted into this template (removing the `<>`) |
| 20 | + isomer "L" "N[C@@H](<sidechain>)C(O)=O" |
| 21 | + isomer "D" "N[C@H](<sidechain>)C(O)=O" |
| 22 | + |
| 23 | + // Bonding site regex — the match is replaced by a link to the acceptor residue |
| 24 | + // NOTE: I really wish I could write something like `N[C@@H](<sidechain>)C(=O)O` for my amino acids, and then have |
| 25 | + // this rule just be something clean like `(O)$`, but that would ultimately generate something like `C(=O)1`, |
| 26 | + // which seems reasonable, and seems to be accepted perfectly fine by most SMILES tools, but technically |
| 27 | + // OpenSMILES demands that those "ring closure" `rnum`s come *before* any branches, so it must be `C1(=O)` or |
| 28 | + // `C1=O` instead. Aggravating, dumb, stupid even, but I'll obey for the sake of the widest possible compatibility |
| 29 | + bond-sites "Pep" donor=r"(\(O\))=O$" acceptor="^N()" |
| 30 | + // FIXME: This remains super dumb and is the fault of known shortcomings in the polymer database KDL format... |
| 31 | + bond-sites "Link" donor=r"(\(O\))=O$" acceptor="^N()" |
| 32 | + // FIXME: This is a sign that things are starting to go wrong... I should be using the functional group |
| 33 | + // abstraction! |
| 34 | + bond-sites "Stem" acceptor="^N()" |
| 35 | + bond-sites "CToN" donor=r"(\(O\))=O$" |
| 36 | + bond-sites "NToC" acceptor="^N()" |
| 37 | + |
| 38 | + // Implicit stereoisomer selection rules — isomers are picked in order of definition unless a rule like |
| 39 | + // `stem` or `lateral` matches, then that `use=` isomer is picked |
| 40 | + stem 4 5 use="D" |
| 41 | + } |
| 42 | + } |
| 43 | + |
| 44 | + Monosaccharide "g" { |
| 45 | + // SMILES written starting at C4's OH, ending with C1's OH — this way, concatenating structures (removing one O) |
| 46 | + // creates a 1-4 glycosidic linkage |
| 47 | + isomer "β-D" "O[C@H]([C@H](O1)CO)[C@H](O)[C@@H](NC(=O)C)[C@@H]1O" |
| 48 | + } |
| 49 | + |
| 50 | + Monosaccharide "m" { |
| 51 | + isomer "β-D" "O[C@H]([C@H](O1)CO)[C@H](O[C@H](C)C(O)=O)[C@@H](NC(=O)C)[C@@H]1O" |
| 52 | + |
| 53 | + // The `O` is lost from the MurNAc's lactyl group to form the peptide bond — the captured regex group is the one lost |
| 54 | + // and replaced with a link during bonding |
| 55 | + bond-sites "Stem" donor=r"(\(O\))=O" |
| 56 | + |
| 57 | + modification "Glyc" replace=r"=O\)C" with="=O)CO" |
| 58 | + // NOTE: Sometimes you'll need to perform several small modifications — we avoid trying to match and replace large |
| 59 | + // sections of the string so that this modification doesn't overwrite any others! |
| 60 | + modification "Anh" { |
| 61 | + replace "$" with="2" |
| 62 | + replace r"\)CO\)\[C@H" with=")C2)[C@H" |
| 63 | + } |
| 64 | + } |
| 65 | + |
| 66 | + AminoAcid "A" { |
| 67 | + sidechain "C" |
| 68 | + } |
| 69 | + |
| 70 | + AminoAcid "B" { |
| 71 | + sidechain "CCN" |
| 72 | + |
| 73 | + // Type B2γ peptidoglycan from Fig 3 of https://doi.org/10.1099/00207713-48-2-403 |
| 74 | + lateral use="D" |
| 75 | + |
| 76 | + // FIXME: Probably better if we take functional groups into account in the future — this is leading to a lot of |
| 77 | + // repetition at the moment... |
| 78 | + // FIXME: Maybe these could be changed to `acceptor "CN()"` for all bonds, or if there are several groups that need to |
| 79 | + // be used for different bonds, then `acceptor "CN()" "CToN" "Link"` etc? Or really, I think this problem would be |
| 80 | + // solved if I kept the current format, but instead of requiring exactly one bond name, I allowed 0+, so these would |
| 81 | + // all be valid: |
| 82 | + // 1) `bond-sites acceptor="CN()"` |
| 83 | + // 2) `bond-sites "CToN" acceptor="CN()"`, and |
| 84 | + // 3) `bond-sites "CToN" "Link" acceptor="CN()"` |
| 85 | + // And so on... |
| 86 | + bond-sites "CToN" acceptor="CN()" |
| 87 | + bond-sites "Link" acceptor="CN()" |
| 88 | + } |
| 89 | + |
| 90 | + AminoAcid "C" { |
| 91 | + sidechain "CS" |
| 92 | + } |
| 93 | + |
| 94 | + AminoAcid "D" { |
| 95 | + sidechain "CC(O)=O" |
| 96 | + |
| 97 | + // By Steph's request |
| 98 | + lateral use="D" |
| 99 | + |
| 100 | + bond-sites "NToC" donor=r"(\(O\))=O\)" |
| 101 | + |
| 102 | + modification "Am" replace=r"O\)=O\)" with="N)=O)" |
| 103 | + } |
| 104 | + |
| 105 | + AminoAcid "E" { |
| 106 | + sidechain "CCC(O)=O" |
| 107 | + |
| 108 | + // Same as a normal D isomer, but I've swapped the sidechain and `COOH` so bonds are made to the side-chain (which now |
| 109 | + // comes last in the SMILES) — because of this swap, `@` must become `@@` to keep the overall same shape |
| 110 | + isomer "D-iso" "N[C@@H](C(O)=O)CCC(O)=O" |
| 111 | + stem 2 use="D-iso" |
| 112 | + |
| 113 | + bond-sites "NToC" donor=r"(\(O\))=O\)" |
| 114 | + |
| 115 | + modification "Am" replace=r"O\)=O\)" with="N)=O)" |
| 116 | + } |
| 117 | + |
| 118 | + AminoAcid "F" { |
| 119 | + sidechain "Cc1ccccc1" |
| 120 | + } |
| 121 | + |
| 122 | + AminoAcid "G" { |
| 123 | + // Here, `null` is just a nameless "isomer" — it's always picked since the lack of `sidechain` means that the L and D |
| 124 | + // isomers aren't defined |
| 125 | + isomer null "NCC(O)=O" |
| 126 | + } |
| 127 | + |
| 128 | + AminoAcid "H" { |
| 129 | + sidechain "Cc1nc[nH]c1" |
| 130 | + } |
| 131 | + |
| 132 | + AminoAcid "I" { |
| 133 | + // Chirality requires the central carbon to have 4 different substituents, this is our first amino acid with another |
| 134 | + // chiral center in the sidechain (so far all other sidechain carbons have had 2+ hydrogens). Importantly, the |
| 135 | + // chirality of the L/D-isoleucine sidechains is *different*, so we need to manually give both isomers. |
| 136 | + isomer "L" "N[C@@H]([C@H](CC)C)C(O)=O" |
| 137 | + isomer "D" "N[C@H]([C@@H](CC)C)C(O)=O" |
| 138 | + } |
| 139 | + |
| 140 | + AminoAcid "J" { |
| 141 | + // No `sidechain`, so no L and D forms... |
| 142 | + isomer "meso" "N[C@@H](CCC[C@@H](N)C(O)=O)C(O)=O" |
| 143 | + |
| 144 | + bond-sites "NToC" donor=r"(\(O\))=O\)" |
| 145 | + bond-sites "CToN" acceptor=r"\(N()\)" |
| 146 | + bond-sites "Link" acceptor=r"\(N()\)" |
| 147 | + |
| 148 | + modification "Am" replace=r"O\)=O\)" with="N)=O)" |
| 149 | + } |
| 150 | + |
| 151 | + AminoAcid "K" { |
| 152 | + sidechain "CCCCN" |
| 153 | + |
| 154 | + bond-sites "CToN" acceptor="CN()" |
| 155 | + bond-sites "Link" acceptor="CN()" |
| 156 | + } |
| 157 | + |
| 158 | + AminoAcid "L" { |
| 159 | + sidechain "CC(C)(C)" |
| 160 | + } |
| 161 | + |
| 162 | + AminoAcid "M" { |
| 163 | + sidechain "CCSC" |
| 164 | + } |
| 165 | + |
| 166 | + AminoAcid "N" { |
| 167 | + sidechain "CC(=O)N" |
| 168 | + |
| 169 | + // By Steph's request |
| 170 | + lateral use="D" |
| 171 | + |
| 172 | + bond-sites "CToN" acceptor=r"\(=O\)N()" |
| 173 | + bond-sites "Link" acceptor=r"\(=O\)N()" |
| 174 | + } |
| 175 | + |
| 176 | + AminoAcid "O" { |
| 177 | + sidechain "CCCN" |
| 178 | + |
| 179 | + bond-sites "CToN" acceptor="CN()" |
| 180 | + bond-sites "Link" acceptor="CN()" |
| 181 | + } |
| 182 | + |
| 183 | + AminoAcid "P" { |
| 184 | + // No `sidechain` since proline's sidechain loops back to the N-terminal — we'll need a different backbone |
| 185 | + isomer "L" "N1[C@@H](CCC1)C(O)=O" |
| 186 | + isomer "D" "N1[C@H](CCC1)C(O)=O" |
| 187 | + } |
| 188 | + |
| 189 | + AminoAcid "Q" { |
| 190 | + sidechain "CCC(=O)N" |
| 191 | + |
| 192 | + // Same as a normal D isomer, but I've swapped the sidechain and `COOH` so bonds are made to the side-chain (which now |
| 193 | + // comes last in the SMILES) — because of this swap, `@` must become `@@` to keep the overall same shape |
| 194 | + isomer "D-iso" "N[C@@H](C(=O)N)CCC(O)=O" |
| 195 | + stem 2 use="D-iso" |
| 196 | + |
| 197 | + bond-sites "CToN" acceptor=r"\(=O\)N()" |
| 198 | + bond-sites "Link" acceptor=r"\(=O\)N()" |
| 199 | + } |
| 200 | + |
| 201 | + AminoAcid "R" { |
| 202 | + sidechain "CCCNC(=N)N" |
| 203 | + } |
| 204 | + |
| 205 | + AminoAcid "S" { |
| 206 | + sidechain "CO" |
| 207 | + } |
| 208 | + |
| 209 | + AminoAcid "T" { |
| 210 | + // The stereochemistry here is evil — D-Threonine has a change in the chirality of its sidechain (when compared to |
| 211 | + // L-Threonine). Because the sidechain changes, we just need to give both isomers from scratch and forget about the |
| 212 | + // templates (so no `sidechain` here)! |
| 213 | + isomer "L" "N[C@@H]([C@@H](C)O)C(O)=O" |
| 214 | + isomer "D" "N[C@H]([C@H](C)O)C(O)=O" |
| 215 | + } |
| 216 | + |
| 217 | + AminoAcid "U" { |
| 218 | + sidechain "CCO" |
| 219 | + } |
| 220 | + |
| 221 | + AminoAcid "V" { |
| 222 | + sidechain "C(C)C" |
| 223 | + } |
| 224 | + |
| 225 | + AminoAcid "W" { |
| 226 | + sidechain "Cc1c2c(cccc2)[nH]c1" // Indole N is pyrrole-type, so its hydrogen must be explicit (same as in "H") |
| 227 | + } |
| 228 | + |
| 229 | + AminoAcid "Y" { |
| 230 | + sidechain "Cc1ccc(O)cc1" |
| 231 | + } |
| 232 | + |
| 233 | + AminoAcid "Z" { |
| 234 | + // Worst one yet... Connected through the γ-carbon like D-isoglutamic acid |
| 235 | + // (https://doi.org/10.1128/br.36.4.407-477.1972; Fig 5), but also like threonine where the L and D forms have |
| 236 | + // different sidechains (chirality-wise): https://doi.org/10.1002/jlac.199419940618 |
| 237 | + isomer "L" "N[C@@H]([C@H](O)CC(=O)O)C(O)=O" |
| 238 | + isomer "D" "N[C@H]([C@@H](O)CC(=O)O)C(O)=O" |
| 239 | + isomer "D-iso" "N[C@@H](C(=O)O)[C@@H](O)CC(O)=O" |
| 240 | + stem 2 use="D-iso" |
| 241 | + } |
| 242 | +} |
| 243 | + |
0 commit comments