Skip to main content

ferritin_core/info/
constants.rs

1//! # Constants
2//!
3//! This module contains core data structures and functions for analyzing molecular structures.
4//!
5//! ## Residue Types
6//! The module provides functions to check residue types:
7//!
8//! - `is_amino_acid()` - Check if a residue is an amino acid
9//! - `is_carbohydrate()` - Check if a residue is a carbohydrate
10//! - `is_nucleotide()` - Check if a residue is a nucleotide
11//!
12//! ## Bond Information
13//! Bond data from CCD includes:
14//!
15//! - Bond lengths with standard deviations
16//! - Canonical amino acid connectivity
17//! - Bond order information
18//!
19
20use std::collections::{HashMap, HashSet};
21use std::sync::OnceLock;
22
/// Returns the `(min, max)` distance window inside which two atoms of the
/// given elements are treated as bonded.
///
/// Each bound is a tabulated extreme widened by two standard deviations
/// (`min - 2*std`, `max + 2*std`); the table mirrors the one used by biotite
/// (values attributed there to Allen et al.).
/// NOTE(review): distances are presumably in ångström — confirm against the
/// upstream table.
///
/// Element symbols are matched exactly as upper-case strings (`"C"`, `"CL"`,
/// `"SE"`, ...). The lookup is symmetric: `("O", "C")` yields the same window
/// as `("C", "O")`, so callers do not have to order the pair themselves.
///
/// # Panics
/// Panics with `Unknown atom pair: ...` when the table has no row for the
/// pair in either order.
#[rustfmt::skip]
pub(crate) fn default_distance_range(a: &str, b: &str) -> (f32, f32) {
    /// Looks the pair up exactly in the order given; `None` when there is no row.
    fn lookup(a: &str, b: &str) -> Option<(f32, f32)> {
        let range = match (a, b) {
            // https://github.com/biotite-dev/biotite/blob/main/src/biotite/structure/bonds.pyx#L1341C1-L1389C1
            //               # Taken from Allen et al.
            //                 min   - 2*std     max   + 2*std
            ("B",  "C" ) => (1.556 - 2.0*0.015,  1.556 + 2.0*0.015),
            ("BR", "C" ) => (1.875 - 2.0*0.029,  1.966 + 2.0*0.029),
            ("BR", "O" ) => (1.581 - 2.0*0.007,  1.581 + 2.0*0.007),
            ("C",  "C" ) => (1.174 - 2.0*0.011,  1.588 + 2.0*0.025),
            ("C",  "CL") => (1.713 - 2.0*0.011,  1.849 + 2.0*0.011),
            ("C",  "F" ) => (1.320 - 2.0*0.009,  1.428 + 2.0*0.009),
            ("C",  "H" ) => (1.059 - 2.0*0.030,  1.099 + 2.0*0.007),
            ("C",  "I" ) => (2.095 - 2.0*0.015,  2.162 + 2.0*0.015),
            ("C",  "N" ) => (1.325 - 2.0*0.009,  1.552 + 2.0*0.023),
            ("C",  "O" ) => (1.187 - 2.0*0.011,  1.477 + 2.0*0.008),
            ("C",  "P" ) => (1.791 - 2.0*0.006,  1.855 + 2.0*0.019),
            ("C",  "S" ) => (1.630 - 2.0*0.014,  1.863 + 2.0*0.015),
            ("C",  "SE") => (1.893 - 2.0*0.013,  1.970 + 2.0*0.032),
            ("C",  "SI") => (1.837 - 2.0*0.012,  1.888 + 2.0*0.023),
            ("CL", "O" ) => (1.414 - 2.0*0.026,  1.414 + 2.0*0.026),
            ("CL", "P" ) => (1.997 - 2.0*0.035,  2.008 + 2.0*0.035),
            ("CL", "S" ) => (2.072 - 2.0*0.023,  2.072 + 2.0*0.023),
            ("CL", "SI") => (2.072 - 2.0*0.009,  2.072 + 2.0*0.009),
            ("F",  "N" ) => (1.406 - 2.0*0.016,  1.406 + 2.0*0.016),
            ("F",  "P" ) => (1.495 - 2.0*0.016,  1.579 + 2.0*0.025),
            ("F",  "S" ) => (1.640 - 2.0*0.011,  1.640 + 2.0*0.011),
            ("F",  "SI") => (1.588 - 2.0*0.014,  1.694 + 2.0*0.013),
            ("H",  "N" ) => (1.009 - 2.0*0.022,  1.033 + 2.0*0.022),
            ("H",  "O" ) => (0.967 - 2.0*0.010,  1.015 + 2.0*0.017),
            ("I",  "O" ) => (2.144 - 2.0*0.028,  2.144 + 2.0*0.028),
            ("N",  "N" ) => (1.124 - 2.0*0.015,  1.454 + 2.0*0.021),
            ("N",  "O" ) => (1.210 - 2.0*0.011,  1.463 + 2.0*0.012),
            ("N",  "P" ) => (1.571 - 2.0*0.013,  1.697 + 2.0*0.015),
            ("N",  "S" ) => (1.541 - 2.0*0.022,  1.710 + 2.0*0.019),
            ("N",  "SI") => (1.711 - 2.0*0.019,  1.748 + 2.0*0.022),
            ("O",  "P" ) => (1.449 - 2.0*0.007,  1.689 + 2.0*0.024),
            ("O",  "S" ) => (1.423 - 2.0*0.008,  1.580 + 2.0*0.015),
            ("O",  "SI") => (1.622 - 2.0*0.014,  1.680 + 2.0*0.008),
            ("P",  "P" ) => (2.214 - 2.0*0.022,  2.214 + 2.0*0.022),
            ("P",  "S" ) => (1.913 - 2.0*0.014,  1.954 + 2.0*0.005),
            ("P",  "SE") => (2.093 - 2.0*0.019,  2.093 + 2.0*0.019),
            ("P",  "SI") => (2.264 - 2.0*0.019,  2.264 + 2.0*0.019),
            ("S",  "S" ) => (1.897 - 2.0*0.012,  2.070 + 2.0*0.022),
            ("S",  "SE") => (2.193 - 2.0*0.015,  2.193 + 2.0*0.015),
            ("S",  "SI") => (2.145 - 2.0*0.020,  2.145 + 2.0*0.020),
            ("SE", "SE") => (2.340 - 2.0*0.024,  2.340 + 2.0*0.024),
            ("SI", "SE") => (2.359 - 2.0*0.012,  2.359 + 2.0*0.012),
            _ => return None,
        };
        Some(range)
    }

    // A bond has no direction, so fall back to the swapped order before
    // declaring the pair unknown. The return type cannot express "no entry",
    // hence the panic is kept for pairs missing in both orders.
    lookup(a, b)
        .or_else(|| lookup(b, a))
        .unwrap_or_else(|| panic!("Unknown atom pair: {} and {}", a, b))
}
74
/// Write-once storage for the map handed out by `get_bonds_canonical20()`.
static AA_BONDS: OnceLock<HashMap<&'static str, Vec<(&'static str, &'static str, i32)>>> =
    OnceLock::new();

/// Intra-residue bond tables for the 20 canonical amino acids.
///
/// The returned map is keyed by the three-letter residue name (`"ALA"`,
/// `"ARG"`, ...). Each value lists that residue's bonds as
/// `(atom_name, atom_name, bond_code)` tuples, hydrogens and the terminal
/// `OXT`/`HXT` atoms included. Data were obtained from the
/// [CCD](https://www.wwpdb.org/data/ccd).
///
/// Bond codes seen in the table are `1`, `2`, `5` and `6`.
/// NOTE(review): `5`/`6` occur only on the ring bonds of HIS, PHE, TRP and
/// TYR — presumably "aromatic single"/"aromatic double" in biotite's
/// `BondType` numbering, with `1`/`2` as plain single/double; confirm.
///
/// The map is built on the first call and the same instance is returned on
/// every later call.
#[rustfmt::skip]
pub(crate) fn get_bonds_canonical20() -> &'static HashMap<&'static str, Vec<(&'static str, &'static str, i32)>> {
    // One row per residue: (residue name, its bonds in CCD order).
    const BOND_TABLE: [(&str, &[(&str, &str, i32)]); 20] = [
        ("ALA", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","HB1",1), ("CB","HB2",1), ("CB","HB3",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HXT","OXT",1),
        ]),
        ("ARG", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","HD2",1), ("CD","HD3",1),
            ("CD","NE",1), ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CZ","NH1",1),
            ("CZ","NH2",2), ("CA","N",1), ("H","N",1), ("H2","N",1), ("CZ","NE",1),
            ("HE","NE",1), ("HH11","NH1",1), ("HH12","NH1",1), ("HH21","NH2",1),
            ("HH22","NH2",1), ("HXT","OXT",1),
        ]),
        ("ASN", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CG","ND2",1), ("CG","OD1",2),
            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HD21","ND2",1), ("HD22","ND2",1),
            ("HXT","OXT",1),
        ]),
        ("ASP", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CG","OD1",2), ("CG","OD2",1),
            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HD2","OD2",1), ("HXT","OXT",1),
        ]),
        ("CYS", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","HB2",1), ("CB","HB3",1), ("CB","SG",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HXT","OXT",1), ("HG","SG",1),
        ]),
        ("GLN", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","NE2",1), ("CD","OE1",2),
            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HE21","NE2",1), ("HE22","NE2",1), ("HXT","OXT",1),
        ]),
        ("GLU", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","OE1",2), ("CD","OE2",1),
            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HE2","OE2",1), ("HXT","OXT",1),
        ]),
        ("GLY", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","HA2",1), ("CA","HA3",1),
            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HXT","OXT",1),
        ]),
        ("HIS", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD2","HD2",1), ("CD2","NE2",5),
            ("CE1","HE1",1), ("CE1","NE2",5), ("CD2","CG",6), ("CG","ND1",5), ("CA","N",1),
            ("H","N",1), ("H2","N",1), ("CE1","ND1",6), ("HD1","ND1",1), ("HE2","NE2",1),
            ("HXT","OXT",1),
        ]),
        ("ILE", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG1",1), ("CB","CG2",1), ("CB","HB",1), ("CD1","HD11",1), ("CD1","HD12",1),
            ("CD1","HD13",1), ("CD1","CG1",1), ("CG1","HG12",1), ("CG1","HG13",1),
            ("CG2","HG21",1), ("CG2","HG22",1), ("CG2","HG23",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HXT","OXT",1),
        ]),
        ("LEU", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","HD11",1), ("CD1","HD12",1),
            ("CD1","HD13",1), ("CD2","HD21",1), ("CD2","HD22",1), ("CD2","HD23",1),
            ("CD1","CG",1), ("CD2","CG",1), ("CG","HG",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HXT","OXT",1),
        ]),
        ("LYS", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","CE",1), ("CD","HD2",1),
            ("CD","HD3",1), ("CE","HE2",1), ("CE","HE3",1), ("CE","NZ",1), ("CD","CG",1),
            ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("H","N",1), ("H2","N",1),
            ("HZ1","NZ",1), ("HZ2","NZ",1), ("HZ3","NZ",1), ("HXT","OXT",1),
        ]),
        ("MET", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CE","HE1",1), ("CE","HE2",1),
            ("CE","HE3",1), ("CG","HG2",1), ("CG","HG3",1), ("CG","SD",1), ("CA","N",1),
            ("H","N",1), ("H2","N",1), ("HXT","OXT",1), ("CE","SD",1),
        ]),
        ("PHE", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","CE1",5), ("CD1","HD1",1),
            ("CD2","CE2",6), ("CD2","HD2",1), ("CE1","CZ",6), ("CE1","HE1",1), ("CE2","CZ",5),
            ("CE2","HE2",1), ("CD1","CG",6), ("CD2","CG",5), ("CZ","HZ",1), ("CA","N",1),
            ("H","N",1), ("H2","N",1), ("HXT","OXT",1),
        ]),
        ("PRO", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD","HD2",1), ("CD","HD3",1),
            ("CD","CG",1), ("CG","HG2",1), ("CG","HG3",1), ("CA","N",1), ("CD","N",1),
            ("H","N",1), ("HXT","OXT",1),
        ]),
        ("SER", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","HB2",1), ("CB","HB3",1), ("CB","OG",1), ("CA","N",1), ("H","N",1),
            ("H2","N",1), ("HG","OG",1), ("HXT","OXT",1),
        ]),
        ("THR", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG2",1), ("CB","HB",1), ("CB","OG1",1), ("CG2","HG21",1), ("CG2","HG22",1),
            ("CG2","HG23",1), ("CA","N",1), ("H","N",1), ("H2","N",1), ("HG1","OG1",1),
            ("HXT","OXT",1),
        ]),
        ("TRP", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","HD1",1), ("CD1","NE1",5),
            ("CD2","CE2",6), ("CD2","CE3",5), ("CE2","CZ2",5), ("CE3","CZ3",6), ("CE3","HE3",1),
            ("CD1","CG",6), ("CD2","CG",5), ("CH2","HH2",1), ("CH2","CZ2",6), ("CZ2","HZ2",1),
            ("CH2","CZ3",5), ("CZ3","HZ3",1), ("CA","N",1), ("H","N",1), ("H2","N",1),
            ("CE2","NE1",5), ("HE1","NE1",1), ("HXT","OXT",1),
        ]),
        ("TYR", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG",1), ("CB","HB2",1), ("CB","HB3",1), ("CD1","CE1",5), ("CD1","HD1",1),
            ("CD2","CE2",6), ("CD2","HD2",1), ("CE1","CZ",6), ("CE1","HE1",1), ("CE2","CZ",5),
            ("CE2","HE2",1), ("CD1","CG",6), ("CD2","CG",5), ("CZ","OH",1), ("CA","N",1),
            ("H","N",1), ("H2","N",1), ("HH","OH",1), ("HXT","OXT",1),
        ]),
        ("VAL", &[
            ("C","O",2), ("C","OXT",1), ("C","CA",1), ("CA","CB",1), ("CA","HA",1),
            ("CB","CG1",1), ("CB","CG2",1), ("CB","HB",1), ("CG1","HG11",1), ("CG1","HG12",1),
            ("CG1","HG13",1), ("CG2","HG21",1), ("CG2","HG22",1), ("CG2","HG23",1),
            ("CA","N",1), ("H","N",1), ("H2","N",1), ("HXT","OXT",1),
        ]),
    ];

    // Materialise the owned map once; later calls reuse the cached instance.
    AA_BONDS.get_or_init(|| {
        BOND_TABLE
            .iter()
            .map(|&(residue, bonds)| (residue, bonds.to_vec()))
            .collect()
    })
}
214
/// A set of `'static` residue-name strings.
type ResidueNameSet = HashSet<&'static str>;

/// Write-once cell for the amino-acid name set; filled on first use by `get_amino_acids()`.
static AMINO_ACIDS: OnceLock<ResidueNameSet> = OnceLock::new();
/// Write-once cell for the carbohydrate name set; filled on first use by `get_carbohydrates()`.
static CARBOHYDRATES: OnceLock<ResidueNameSet> = OnceLock::new();
/// Write-once cell for the nucleotide name set; filled on first use by `get_nucleotides()`.
static NUCLEOTIDES: OnceLock<ResidueNameSet> = OnceLock::new();
218
219fn get_amino_acids() -> &'static HashSet<&'static str> {
220    AMINO_ACIDS.get_or_init(|| include_str!("ccddata/amino_acids.txt").lines().collect())
221}
222
223fn get_carbohydrates() -> &'static HashSet<&'static str> {
224    CARBOHYDRATES.get_or_init(|| include_str!("ccddata/carbohydrates.txt").lines().collect())
225}
226
227fn get_nucleotides() -> &'static HashSet<&'static str> {
228    NUCLEOTIDES.get_or_init(|| include_str!("ccddata/nucleotides.txt").lines().collect())
229}
230
231pub(crate) fn is_amino_acid(symbol: &str) -> bool {
232    get_amino_acids().contains(symbol)
233}
234
235pub(crate) fn is_carbohydrate(symbol: &str) -> bool {
236    get_carbohydrates().contains(symbol)
237}
238
239pub(crate) fn is_nucleotide(symbol: &str) -> bool {
240    get_nucleotides().contains(symbol)
241}
242
#[cfg(test)]
mod tests {
    use super::*;

    /// Every classifier accepts two codes expected in its data file and
    /// rejects the made-up code `ZZZ`.
    #[test]
    fn test_residue_checking() {
        // Amino acids.
        for code in ["ALA", "ARG"] {
            assert!(is_amino_acid(code));
        }
        assert!(!is_amino_acid("ZZZ"));

        // Carbohydrates.
        for code in ["045", "05L"] {
            assert!(is_carbohydrate(code));
        }
        assert!(!is_carbohydrate("ZZZ"));

        // Nucleotides.
        for code in ["02I", "05A"] {
            assert!(is_nucleotide(code));
        }
        assert!(!is_nucleotide("ZZZ"));
    }
}