ferritin_plms/esm/utils/constants/
esm3.rs

1// use cached::proc_macro::cached;
2// use huggingface_hub::snapshot_download;
3// use std::env;
4// use std::path::PathBuf;
5
// Ids of the special tokens on the sequence track. Each value is the position
// of the corresponding entry in `SEQUENCE_VOCAB`; keep the two in sync.

/// Id of the beginning-of-sequence token (`"<cls>"`, vocabulary index 0).
pub const SEQUENCE_BOS_TOKEN: i32 = 0;
/// Id of the padding token (`"<pad>"`, vocabulary index 1).
pub const SEQUENCE_PAD_TOKEN: i32 = 1;
/// Id of the end-of-sequence token (`"<eos>"`, vocabulary index 2).
pub const SEQUENCE_EOS_TOKEN: i32 = 2;
/// Id of the chain-break token (`"|"`, vocabulary index 31).
pub const SEQUENCE_CHAINBREAK_TOKEN: i32 = 31;
/// Id of the mask token (`"<mask>"`, vocabulary index 32).
pub const SEQUENCE_MASK_TOKEN: i32 = 32;
/// Size of the VQ-VAE codebook.
///
/// The structure-track special tokens are numbered upward from this value,
/// so ids below it are presumably regular codebook entries (TODO confirm
/// against the tokenizer).
pub const VQVAE_CODEBOOK_SIZE: i32 = 4096;
12
13// lazy_static! {
14//     static ref VQVAE_SPECIAL_TOKENS: std::collections::HashMap<&'static str, i32> = {
15//         let mut m = std::collections::HashMap::new();
16//         m.insert("MASK", VQVAE_CODEBOOK_SIZE);
17//         m.insert("EOS", VQVAE_CODEBOOK_SIZE + 1);
18//         m.insert("BOS", VQVAE_CODEBOOK_SIZE + 2);
19//         m.insert("PAD", VQVAE_CODEBOOK_SIZE + 3);
20//         m.insert("CHAINBREAK", VQVAE_CODEBOOK_SIZE + 4);
21//         m
22//     };
23// }
24// const VQVAE_DIRECTION_LOSS_BINS: i32 = 16;
25// const VQVAE_PAE_BINS: i32 = 64;
26// const VQVAE_MAX_PAE_BIN: f32 = 31.0;
27// const VQVAE_PLDDT_BINS: i32 = 50;
28
29const STRUCTURE_MASK_TOKEN: i32 = VQVAE_CODEBOOK_SIZE;
30const STRUCTURE_BOS_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 2;
31const STRUCTURE_EOS_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 1;
32const STRUCTURE_PAD_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 3;
33const STRUCTURE_CHAINBREAK_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 4;
34const STRUCTURE_UNDEFINED_TOKEN: i32 = 955;
35
// Padding ids for the remaining tracks; each track pads with id 0.

/// Padding token id for the SASA track.
pub const SASA_PAD_TOKEN: i32 = 0;

/// Padding token id for the SS8 track.
pub const SS8_PAD_TOKEN: i32 = 0;

/// Padding token id for the InterPro track.
pub const INTERPRO_PAD_TOKEN: i32 = 0;

/// Padding token id for the residue track.
pub const RESIDUE_PAD_TOKEN: i32 = 0;
43
// String spellings of the special tokens.

/// String form of the chain-break token.
pub const CHAIN_BREAK_STR: &str = "|";

/// String form of the sequence beginning-of-sequence token.
pub const SEQUENCE_BOS_STR: &str = "<cls>";
/// String form of the sequence end-of-sequence token.
pub const SEQUENCE_EOS_STR: &str = "<eos>";

/// Single-character mask marker.
///
/// NOTE(review): presumably shorthand for `SEQUENCE_MASK_STR` in user-supplied
/// sequences; confirm against the tokenizer.
pub const MASK_STR_SHORT: &str = "_";
/// String form of the sequence mask token.
pub const SEQUENCE_MASK_STR: &str = "<mask>";
/// Mask string for the SASA track (same spelling as the SS8 one).
pub const SASA_MASK_STR: &str = "<unk>";
/// Mask string for the SS8 track (same spelling as the SASA one).
pub const SS8_MASK_STR: &str = "<unk>";
53
/// Sequence-track vocabulary: the position of each entry is its token id.
///
/// The order is significant; it holds 33 entries (ids 0 through 32).
#[rustfmt::skip]
pub const SEQUENCE_VOCAB: &[&str] = &[
    // ids 0-3: special tokens
    "<cls>", "<pad>", "<eos>", "<unk>",
    // ids 4-28: single-letter residue codes
    "L", "A", "G", "V", "S",
    "E", "R", "T", "I", "D",
    "P", "K", "Q", "N", "F",
    "Y", "M", "H", "W", "C",
    "X", "B", "U", "Z", "O",
    // ids 29-30: "." and "-"
    ".", "-",
    // ids 31-32: chain break and mask
    "|", "<mask>",
];
58
/// Eight-class secondary-structure alphabet, one ASCII character per class.
pub const SSE_8CLASS_VOCAB: &str = "GHITEBSC";
/// Three-class secondary-structure alphabet, one ASCII character per class.
pub const SSE_3CLASS_VOCAB: &str = "HEC";
61
62// lazy_static! {
63//     static ref SSE_8CLASS_TO_3CLASS_MAP: std::collections::HashMap<&'static str, &'static str> = {
64//         let mut m = std::collections::HashMap::new();
65//         m.insert("G", "H");
66//         m.insert("H", "H");
67//         m.insert("I", "H");
68//         m.insert("T", "C");
69//         m.insert("E", "E");
70//         m.insert("B", "E");
71//         m.insert("S", "C");
72//         m.insert("C", "C");
73//         m
74//     };
75// }
76
/// Bin edges used to discretize SASA values.
///
/// The 15 edges are strictly increasing.
/// NOTE(review): units and whether an edge belongs to the lower or upper bin
/// are not visible in this file; confirm against the tokenizer.
pub const SASA_DISCRETIZATION_BOUNDARIES: &[f32] = &[
    0.8, 4.0, 9.6, 16.4, 24.5, 32.9, 42.0, 51.5, 61.2, 70.9, 81.6, 93.3, 107.2, 125.4, 151.4,
];
80
/// Upper bound on residue annotations.
///
/// NOTE(review): presumably a per-residue limit; confirm against the caller.
pub const MAX_RESIDUE_ANNOTATIONS: i32 = 16;

/// Length of the TF-IDF vector.
///
/// NOTE(review): 58641 also appears in the keyword data file names referenced
/// elsewhere in this file, so this presumably equals the keyword vocabulary
/// size; confirm.
pub const TFIDF_VECTOR_SIZE: i32 = 58641;
84
85// #[cached]
86// fn data_root(model: &str) -> PathBuf {
87//     if env::var("INFRA_PROVIDER").is_ok() {
88//         return PathBuf::from("");
89//     }
90
91//     let path = match model {
92//         m if m.starts_with("esm3") => {
93//             snapshot_download("EvolutionaryScale/esm3-sm-open-v1").unwrap()
94//         }
95//         m if m.starts_with("esmc-300") => {
96//             snapshot_download("EvolutionaryScale/esmc-300m-2024-12").unwrap()
97//         }
98//         m if m.starts_with("esmc-600") => {
99//             snapshot_download("EvolutionaryScale/esmc-600m-2024-12").unwrap()
100//         }
101//         _ => panic!("{:?} is an invalid model name", model),
102//     };
103
104//     PathBuf::from(path)
105// }
106
107// lazy_static! {
108//     static ref IN_REPO_DATA_FOLDER: PathBuf =
109//         PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("data");
110// }
111
112// lazy_static! {
113//     static ref INTERPRO_ENTRY: PathBuf = IN_REPO_DATA_FOLDER.join("entry_list_safety_29026.list");
114//     static ref INTERPRO_HIERARCHY: PathBuf = IN_REPO_DATA_FOLDER.join("ParentChildTreeFile.txt");
115//     static ref INTERPRO2GO: PathBuf = IN_REPO_DATA_FOLDER.join("ParentChildTreeFile.txt");
116// }
117
118// const INTERPRO_2ID: &str = "data/tag_dict_4_safety_filtered.json";
119
120// lazy_static! {
121//     static ref LSH_TABLE_PATHS: std::collections::HashMap<&'static str, &'static str> = {
122//         let mut m = std::collections::HashMap::new();
123//         m.insert("8bit", "data/hyperplanes_8bit_58641.npz");
124//         m
125//     };
126// }
127
128// lazy_static! {
129//     static ref KEYWORDS_VOCABULARY: PathBuf =
130//         IN_REPO_DATA_FOLDER.join("keyword_vocabulary_safety_filtered_58641.txt");
131//     static ref KEYWORDS_IDF: PathBuf =
132//         IN_REPO_DATA_FOLDER.join("keyword_idf_safety_filtered_58641.npy");
133// }
134
135// const RESID_CSV: &str = "data/uniref90_and_mgnify90_residue_annotations_gt_1k_proteins.csv";
136
137// lazy_static! {
138//     static ref INTERPRO2KEYWORDS: PathBuf =
139//         IN_REPO_DATA_FOLDER.join("interpro_29026_to_keywords_58641.csv");
140// }