ferritin_core/info/
constants.rs

//! # Constants
//!
//! This module contains core data structures and functions for analyzing molecular structures.
//!
//! ## Residue Types
//! The module provides functions to check residue types:
//!
//! - `is_amino_acid()` - Check if a residue is an amino acid
//! - `is_carbohydrate()` - Check if a residue is a carbohydrate
//! - `is_nucleotide()` - Check if a residue is a nucleotide
//!
//! ## Bond Information
//! Bond data from CCD includes:
//!
//! - Bond lengths with standard deviations
//! - Canonical amino acid connectivity
//! - Bond order information
//!
19
20use std::collections::{HashMap, HashSet};
21use std::sync::OnceLock;
22
/// Returns the accepted bond-length window `(lower, upper)` for a pair of elements.
///
/// `a` and `b` are element symbols spelled exactly as in the table below
/// (upper case, e.g. `"C"`, `"CL"`, `"SE"`). Each entry evaluates to
/// `(min - 2*std, max + 2*std)`; the numbers were copied from Biotite, which
/// attributes them to Allen et al. (see the link inside the table).
///
/// A bond between `a` and `b` is the same bond as one between `b` and `a`, so
/// the lookup is order-independent: `("N", "C")` yields the same window as
/// `("C", "N")` even though the table stores each pair only once.
///
/// # Panics
///
/// Panics with `Unknown atom pair: {a} and {b}` when the pair is not tabulated
/// in either order.
#[rustfmt::skip]
pub(crate) fn default_distance_range(a: &str, b: &str) -> (f32, f32) {
    /// Looks up the pair in the exact order in which the table stores it.
    fn tabulated(a: &str, b: &str) -> Option<(f32, f32)> {
        Some(match (a, b) {
            // https://github.com/biotite-dev/biotite/blob/main/src/biotite/structure/bonds.pyx#L1341C1-L1389C1
            //               # Taken from Allen et al.
            //                 min   - 2*std     max   + 2*std
            ("B",  "C" ) => (1.556 - 2.0*0.015,  1.556 + 2.0*0.015),
            ("BR", "C" ) => (1.875 - 2.0*0.029,  1.966 + 2.0*0.029),
            ("BR", "O" ) => (1.581 - 2.0*0.007,  1.581 + 2.0*0.007),
            ("C",  "C" ) => (1.174 - 2.0*0.011,  1.588 + 2.0*0.025),
            ("C",  "CL") => (1.713 - 2.0*0.011,  1.849 + 2.0*0.011),
            ("C",  "F" ) => (1.320 - 2.0*0.009,  1.428 + 2.0*0.009),
            ("C",  "H" ) => (1.059 - 2.0*0.030,  1.099 + 2.0*0.007),
            ("C",  "I" ) => (2.095 - 2.0*0.015,  2.162 + 2.0*0.015),
            ("C",  "N" ) => (1.325 - 2.0*0.009,  1.552 + 2.0*0.023),
            ("C",  "O" ) => (1.187 - 2.0*0.011,  1.477 + 2.0*0.008),
            ("C",  "P" ) => (1.791 - 2.0*0.006,  1.855 + 2.0*0.019),
            ("C",  "S" ) => (1.630 - 2.0*0.014,  1.863 + 2.0*0.015),
            ("C",  "SE") => (1.893 - 2.0*0.013,  1.970 + 2.0*0.032),
            ("C",  "SI") => (1.837 - 2.0*0.012,  1.888 + 2.0*0.023),
            ("CL", "O" ) => (1.414 - 2.0*0.026,  1.414 + 2.0*0.026),
            ("CL", "P" ) => (1.997 - 2.0*0.035,  2.008 + 2.0*0.035),
            ("CL", "S" ) => (2.072 - 2.0*0.023,  2.072 + 2.0*0.023),
            ("CL", "SI") => (2.072 - 2.0*0.009,  2.072 + 2.0*0.009),
            ("F",  "N" ) => (1.406 - 2.0*0.016,  1.406 + 2.0*0.016),
            ("F",  "P" ) => (1.495 - 2.0*0.016,  1.579 + 2.0*0.025),
            ("F",  "S" ) => (1.640 - 2.0*0.011,  1.640 + 2.0*0.011),
            ("F",  "SI") => (1.588 - 2.0*0.014,  1.694 + 2.0*0.013),
            ("H",  "N" ) => (1.009 - 2.0*0.022,  1.033 + 2.0*0.022),
            ("H",  "O" ) => (0.967 - 2.0*0.010,  1.015 + 2.0*0.017),
            ("I",  "O" ) => (2.144 - 2.0*0.028,  2.144 + 2.0*0.028),
            ("N",  "N" ) => (1.124 - 2.0*0.015,  1.454 + 2.0*0.021),
            ("N",  "O" ) => (1.210 - 2.0*0.011,  1.463 + 2.0*0.012),
            ("N",  "P" ) => (1.571 - 2.0*0.013,  1.697 + 2.0*0.015),
            ("N",  "S" ) => (1.541 - 2.0*0.022,  1.710 + 2.0*0.019),
            ("N",  "SI") => (1.711 - 2.0*0.019,  1.748 + 2.0*0.022),
            ("O",  "P" ) => (1.449 - 2.0*0.007,  1.689 + 2.0*0.024),
            ("O",  "S" ) => (1.423 - 2.0*0.008,  1.580 + 2.0*0.015),
            ("O",  "SI") => (1.622 - 2.0*0.014,  1.680 + 2.0*0.008),
            ("P",  "P" ) => (2.214 - 2.0*0.022,  2.214 + 2.0*0.022),
            ("P",  "S" ) => (1.913 - 2.0*0.014,  1.954 + 2.0*0.005),
            ("P",  "SE") => (2.093 - 2.0*0.019,  2.093 + 2.0*0.019),
            ("P",  "SI") => (2.264 - 2.0*0.019,  2.264 + 2.0*0.019),
            ("S",  "S" ) => (1.897 - 2.0*0.012,  2.070 + 2.0*0.022),
            ("S",  "SE") => (2.193 - 2.0*0.015,  2.193 + 2.0*0.015),
            ("S",  "SI") => (2.145 - 2.0*0.020,  2.145 + 2.0*0.020),
            ("SE", "SE") => (2.340 - 2.0*0.024,  2.340 + 2.0*0.024),
            ("SI", "SE") => (2.359 - 2.0*0.012,  2.359 + 2.0*0.012),
            _ => return None,
        })
    }

    // Try the caller's order first, then the swapped order; only a pair that
    // is missing both ways is reported as unknown (with the caller's order).
    tabulated(a, b)
        .or_else(|| tabulated(b, a))
        .unwrap_or_else(|| panic!("Unknown atom pair: {} and {}", a, b))
}
74
75static AA_BONDS: OnceLock<HashMap<&'static str, Vec<(&'static str, &'static str, i32)>>> =
76    OnceLock::new();
77
78#[rustfmt::skip]
79/// get_bonds_canonical20
80///
81/// This is the bond information for the 10 canonical
82/// AAs.  Data were obtained from the [CCD](https://www.wwpdb.org/data/ccd).
83///
84pub(crate) fn get_bonds_canonical20() -> &'static HashMap<&'static str, Vec<(&'static str, &'static str, i32)>> {
85    AA_BONDS.get_or_init(|| {
86        let mut m = HashMap::new();
87        m.insert("ALA", vec![
88            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
89            ("CB","HB1",1), ("CB","HB2",1), ("CB","HB3",1), ("CA","N",1), ("H","N",1),
90            ("H2","N",1), ("HXT","OXT",1)
91        ]);
92        m.insert("ARG", vec![
93            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
94            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","HD2",1), ("CD","HD3",1),
95            ("CD","NE",1), ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CZ","NH1",1),
96            ("CZ","NH2",2), ("CA","N",1), ("H","N",1), ("H2","N",1), ("CZ","NE",1),
97            ("HE","NE",1), ("HH11","NH1",1), ("HH12","NH1",1), ("HH21","NH2",1),
98            ("HH22","NH2",1), ("HXT","OXT",1)
99        ]);
100        m.insert("ASN", vec![
101            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
102            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CG","ND2",1), ("CG","OD1",2),
103            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HD21","ND2",1), ("HD22","ND2",1),
104            ("HXT","OXT",1)
105        ]);
106        m.insert("ASP", vec![
107            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
108            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CG","OD1",2), ("CG","OD2",1),
109            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HD2","OD2",1), ("HXT","OXT",1)
110        ]);
111        m.insert("CYS", vec![
112            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
113            ("CB","HB2",1), ("CB","HB3",1), ("CB","SG",1), ("CA","N",1), ("H","N",1),
114            ("H2","N",1), ("HXT","OXT",1), ("HG","SG",1)
115        ]);
116        m.insert("GLN", vec![
117            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
118            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","NE2",1), ("CD","OE1",2),
119            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1),
120            ("H2","N",1), ("HE21","NE2",1), ("HE22","NE2",1), ("HXT","OXT",1)
121        ]);
122        m.insert("GLU", vec![
123            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
124            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","OE1",2), ("CD","OE2",1),
125            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1),
126            ("H2","N",1), ("HE2","OE2",1), ("HXT","OXT",1)
127        ]);
128        m.insert("GLY", vec![
129            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","HA2",1), ("CA","HA3",1),
130            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HXT","OXT",1)
131        ]);
132        m.insert("HIS", vec![
133            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
134            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD2","HD2",1), ("CD2","NE2",5),
135            ("CE1","HE1",1), ("CE1","NE2",5), ("CD2","CG",6), ("CG","ND1",5), ("CA","N",1),
136            ("H","N",1), ("H2","N",1), ("CE1","ND1",6), ("HD1","ND1",1), ("HE2","NE2",1),
137            ("HXT","OXT",1)
138        ]);
139        m.insert("ILE", vec![
140            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
141            ("CB","CG1",1), ("CB","CG2",1), ("CB","HB",1), ("CD1","HD11",1), ("CD1","HD12",1),
142            ("CD1","HD13",1), ("CD1","CG1",1), ("CG1","HG12",1), ("CG1","HG13",1),
143            ("CG2","HG21",1), ("CG2","HG22",1), ("CG2","HG23",1), ("CA","N",1), ("H","N",1),
144            ("H2","N",1), ("HXT","OXT",1)
145        ]);
146        m.insert("LEU", vec![
147            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
148            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","HD11",1), ("CD1","HD12",1),
149            ("CD1","HD13",1), ("CD2","HD21",1), ("CD2","HD22",1), ("CD2","HD23",1),
150            ("CD1","CG",1), ("CD2","CG",1), ("CG","HG",1), ("CA","N",1), ("H","N",1),
151            ("H2","N",1), ("HXT","OXT",1)
152        ]);
153        m.insert("LYS", vec![
154            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
155            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","CE",1), ("CD","HD2",1),
156            ("CD","HD3",1), ("CE","HE2",1), ("CE","HE3",1), ("CE","NZ",1), ("CD","CG",1),
157            ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1), ("H2","N",1),
158            ("HZ1","NZ",1), ("HZ2","NZ",1), ("HZ3","NZ",1), ("HXT","OXT",1)
159        ]);
160        m.insert("MET", vec![
161            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
162            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CE","HE1",1), ("CE","HE2",1),
163            ("CE","HE3",1), ("CG","HG2",1), ("CG","HG3",1), ("CG","SD",1), ("CA","N",1),
164            ("H","N",1), ("H2","N",1), ("HXT","OXT",1), ("CE","SD",1)
165        ]);
166        m.insert("PHE", vec![
167            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
168            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","CE1",5), ("CD1","HD1",1),
169            ("CD2","CE2",6), ("CD2","HD2",1), ("CE1","CZ",6), ("CE1","HE1",1), ("CE2","CZ",5),
170            ("CE2","HE2",1), ("CD1","CG",6), ("CD2","CG",5), ("CZ","HZ",1), ("CA","N",1),
171            ("H","N",1), ("H2","N",1), ("HXT","OXT",1)
172        ]);
173        m.insert("PRO", vec![
174            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
175            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","HD2",1), ("CD","HD3",1),
176            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("CD","N",1),
177            ("H","N",1), ("HXT","OXT",1)
178        ]);
179        m.insert("SER", vec![
180            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
181            ("CB","HB2",1), ("CB","HB3",1), ("CB","OG",1), ("CA","N",1), ("H","N",1),
182            ("H2","N",1), ("HG","OG",1), ("HXT","OXT",1)
183        ]);
184        m.insert("THR", vec![
185            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
186            ("CB","CG2",1), ("CB","HB",1), ("CB","OG1",1), ("CG2","HG21",1), ("CG2","HG22",1),
187            ("CG2","HG23",1), ("CA","N",1), ("H","N",1), ("H2","N",1), ("HG1","OG1",1),
188            ("HXT","OXT",1)
189        ]);
190        m.insert("TRP", vec![
191            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
192            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","HD1",1), ("CD1","NE1",5),
193            ("CD2","CE2",6), ("CD2","CE3",5), ("CE2","CZ2",5), ("CE3","CZ3",6), ("CE3","HE3",1),
194            ("CD1","CG",6), ("CD2","CG",5), ("CH2","HH2",1), ("CH2","CZ2",6), ("CZ2","HZ2",1),
195            ("CH2","CZ3",5), ("CZ3","HZ3",1), ("CA","N",1), ("H","N",1), ("H2","N",1),
196            ("CE2","NE1",5), ("HE1","NE1",1), ("HXT","OXT",1)
197        ]);
198        m.insert("TYR", vec![
199            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
200            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","CE1",5), ("CD1","HD1",1),
201            ("CD2","CE2",6), ("CD2","HD2",1), ("CE1","CZ",6), ("CE1","HE1",1), ("CE2","CZ",5),
202            ("CE2","HE2",1), ("CD1","CG",6), ("CD2","CG",5), ("CZ","OH",1), ("CA","N",1),
203            ("H","N",1), ("H2","N",1), ("HH","OH",1), ("HXT","OXT",1)
204        ]);
205        m.insert("VAL", vec![
206            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
207            ("CB","CG1",1), ("CB","CG2",1), ("CB","HB",1), ("CG1","HG11",1), ("CG1","HG12",1),
208            ("CG1","HG13",1), ("CG2","HG21",1), ("CG2","HG22",1), ("CG2","HG23",1),
209            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HXT","OXT",1)
210        ]);
211        m
212    })
213}
214
215static AMINO_ACIDS: OnceLock<HashSet<&'static str>> = OnceLock::new();
216static CARBOHYDRATES: OnceLock<HashSet<&'static str>> = OnceLock::new();
217static NUCLEOTIDES: OnceLock<HashSet<&'static str>> = OnceLock::new();
218
219fn get_amino_acids() -> &'static HashSet<&'static str> {
220    AMINO_ACIDS.get_or_init(|| include_str!("ccddata/amino_acids.txt").lines().collect())
221}
222
223fn get_carbohydrates() -> &'static HashSet<&'static str> {
224    CARBOHYDRATES.get_or_init(|| include_str!("ccddata/carbohydrates.txt").lines().collect())
225}
226
227fn get_nucleotides() -> &'static HashSet<&'static str> {
228    NUCLEOTIDES.get_or_init(|| include_str!("ccddata/nucleotides.txt").lines().collect())
229}
230
231pub(crate) fn is_amino_acid(symbol: &str) -> bool {
232    get_amino_acids().contains(symbol)
233}
234
235pub(crate) fn is_carbohydrate(symbol: &str) -> bool {
236    get_carbohydrates().contains(symbol)
237}
238
239pub(crate) fn is_nucleotide(symbol: &str) -> bool {
240    get_nucleotides().contains(symbol)
241}
242
#[cfg(test)]
mod tests {
    use super::*;

    /// Spot-checks each residue classifier with two listed codes and one
    /// code (`"ZZZ"`) that must be rejected by all three.
    #[test]
    fn test_residue_checking() {
        for code in ["ALA", "ARG"] {
            assert!(is_amino_acid(code), "{code} should be an amino acid");
        }
        assert!(!is_amino_acid("ZZZ"));

        for code in ["045", "05L"] {
            assert!(is_carbohydrate(code), "{code} should be a carbohydrate");
        }
        assert!(!is_carbohydrate("ZZZ"));

        for code in ["02I", "05A"] {
            assert!(is_nucleotide(code), "{code} should be a nucleotide");
        }
        assert!(!is_nucleotide("ZZZ"));
    }
}