ferritin_plms/esmc/utils/constants/esm3.rs
1#![allow(dead_code)]
2// use cached::proc_macro::cached;
3// use huggingface_hub::snapshot_download;
4// use std::env;
5// use std::path::PathBuf;
6
// Special-token ids of the sequence track. Each id equals the position of the
// matching entry in `SEQUENCE_VOCAB`.

/// Id of the beginning-of-sequence token (`"<cls>"`, vocabulary position 0).
const SEQUENCE_BOS_TOKEN: i32 = 0;
/// Id of the padding token (`"<pad>"`, vocabulary position 1).
const SEQUENCE_PAD_TOKEN: i32 = 1;
/// Id of the end-of-sequence token (`"<eos>"`, vocabulary position 2).
const SEQUENCE_EOS_TOKEN: i32 = 2;
/// Id of the chain-break token (`"|"`, vocabulary position 31).
const SEQUENCE_CHAINBREAK_TOKEN: i32 = 31;
/// Id of the mask token (`"<mask>"`, vocabulary position 32).
const SEQUENCE_MASK_TOKEN: i32 = 32;

/// Size of the VQ-VAE codebook (2^12 = 4096). The structure-track special
/// tokens are numbered starting at this value.
const VQVAE_CODEBOOK_SIZE: i32 = 1 << 12;
13
14// lazy_static! {
15// static ref VQVAE_SPECIAL_TOKENS: std::collections::HashMap<&'static str, i32> = {
16// let mut m = std::collections::HashMap::new();
17// m.insert("MASK", VQVAE_CODEBOOK_SIZE);
18// m.insert("EOS", VQVAE_CODEBOOK_SIZE + 1);
19// m.insert("BOS", VQVAE_CODEBOOK_SIZE + 2);
20// m.insert("PAD", VQVAE_CODEBOOK_SIZE + 3);
21// m.insert("CHAINBREAK", VQVAE_CODEBOOK_SIZE + 4);
22// m
23// };
24// }
25// const VQVAE_DIRECTION_LOSS_BINS: i32 = 16;
26// const VQVAE_PAE_BINS: i32 = 64;
27// const VQVAE_MAX_PAE_BIN: f32 = 31.0;
28// const VQVAE_PLDDT_BINS: i32 = 50;
29
30const STRUCTURE_MASK_TOKEN: i32 = VQVAE_CODEBOOK_SIZE;
31const STRUCTURE_BOS_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 2;
32const STRUCTURE_EOS_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 1;
33const STRUCTURE_PAD_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 3;
34const STRUCTURE_CHAINBREAK_TOKEN: i32 = VQVAE_CODEBOOK_SIZE + 4;
35const STRUCTURE_UNDEFINED_TOKEN: i32 = 955;
36
// Padding ids for the remaining tracks. Every one of them is 0.

/// Padding id for the SASA track.
const SASA_PAD_TOKEN: i32 = 0;
/// Padding id for the SS8 track.
const SS8_PAD_TOKEN: i32 = 0;
/// Padding id for the InterPro track.
const INTERPRO_PAD_TOKEN: i32 = 0;
/// Padding id for the residue track.
const RESIDUE_PAD_TOKEN: i32 = 0;
44
// String spellings of the special tokens.

/// Chain-break marker; the same string appears as the `"|"` entry of
/// `SEQUENCE_VOCAB`.
const CHAIN_BREAK_STR: &str = "|";

/// Beginning-of-sequence marker of the sequence track.
const SEQUENCE_BOS_STR: &str = "<cls>";
/// End-of-sequence marker of the sequence track.
const SEQUENCE_EOS_STR: &str = "<eos>";

/// Single-character mask spelling.
const MASK_STR_SHORT: &str = "_";
/// Mask spelling of the sequence track.
const SEQUENCE_MASK_STR: &str = "<mask>";
/// Mask spelling of the SASA track; note that it is `"<unk>"`, not `"<mask>"`.
const SASA_MASK_STR: &str = "<unk>";
/// Mask spelling of the SS8 track; also `"<unk>"`.
const SS8_MASK_STR: &str = "<unk>";
54
/// Vocabulary of the sequence track, 33 entries. The position of an entry in
/// this slice is its token id, so the order must not change.
pub const SEQUENCE_VOCAB: &[&str] = &[
    // ids 0-3: control tokens
    "<cls>", "<pad>", "<eos>", "<unk>",
    // ids 4-28: single-letter residue codes
    "L", "A", "G", "V", "S", "E", "R", "T", "I", "D",
    "P", "K", "Q", "N", "F", "Y", "M", "H", "W", "C",
    "X", "B", "U", "Z", "O",
    // ids 29-32: punctuation-like symbols, chain break and mask
    ".", "-", "|", "<mask>",
];
59
/// The eight secondary-structure class letters, one character per class.
// NOTE(review): these look like DSSP codes -- confirm against the upstream
// tokenizer.
const SSE_8CLASS_VOCAB: &str = "GHITEBSC";
/// The three letters of the reduced secondary-structure alphabet.
const SSE_3CLASS_VOCAB: &str = "HEC";
62
63// lazy_static! {
64// static ref SSE_8CLASS_TO_3CLASS_MAP: std::collections::HashMap<&'static str, &'static str> = {
65// let mut m = std::collections::HashMap::new();
66// m.insert("G", "H");
67// m.insert("H", "H");
68// m.insert("I", "H");
69// m.insert("T", "C");
70// m.insert("E", "E");
71// m.insert("B", "E");
72// m.insert("S", "C");
73// m.insert("C", "C");
74// m
75// };
76// }
77
/// The 15 strictly increasing boundary values used to discretize SASA.
// NOTE(review): 15 boundaries would delimit 16 bins; units are not stated
// here -- confirm against the upstream tokenizer.
const SASA_DISCRETIZATION_BOUNDARIES: &[f32] = &[
    0.8, 4.0, 9.6, 16.4, 24.5,
    32.9, 42.0, 51.5, 61.2, 70.9,
    81.6, 93.3, 107.2, 125.4, 151.4,
];
81
/// Upper limit on residue annotations.
// NOTE(review): presumably a per-residue limit -- confirm against callers.
const MAX_RESIDUE_ANNOTATIONS: i32 = 16;

/// Length of the TF-IDF vector. The same number, 58641, appears in the
/// keyword data file names referenced in the commented-out code in this file.
const TFIDF_VECTOR_SIZE: i32 = 58_641;
85
86// #[cached]
87// fn data_root(model: &str) -> PathBuf {
88// if env::var("INFRA_PROVIDER").is_ok() {
89// return PathBuf::from("");
90// }
91
92// let path = match model {
93// m if m.starts_with("esm3") => {
94// snapshot_download("EvolutionaryScale/esm3-sm-open-v1").unwrap()
95// }
96// m if m.starts_with("esmc-300") => {
97// snapshot_download("EvolutionaryScale/esmc-300m-2024-12").unwrap()
98// }
99// m if m.starts_with("esmc-600") => {
100// snapshot_download("EvolutionaryScale/esmc-600m-2024-12").unwrap()
101// }
102// _ => panic!("{:?} is an invalid model name", model),
103// };
104
105// PathBuf::from(path)
106// }
107
108// lazy_static! {
109// static ref IN_REPO_DATA_FOLDER: PathBuf =
110// PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("data");
111// }
112
113// lazy_static! {
114// static ref INTERPRO_ENTRY: PathBuf = IN_REPO_DATA_FOLDER.join("entry_list_safety_29026.list");
115// static ref INTERPRO_HIERARCHY: PathBuf = IN_REPO_DATA_FOLDER.join("ParentChildTreeFile.txt");
116// static ref INTERPRO2GO: PathBuf = IN_REPO_DATA_FOLDER.join("ParentChildTreeFile.txt");
117// }
118
119// const INTERPRO_2ID: &str = "data/tag_dict_4_safety_filtered.json";
120
121// lazy_static! {
122// static ref LSH_TABLE_PATHS: std::collections::HashMap<&'static str, &'static str> = {
123// let mut m = std::collections::HashMap::new();
124// m.insert("8bit", "data/hyperplanes_8bit_58641.npz");
125// m
126// };
127// }
128
129// lazy_static! {
130// static ref KEYWORDS_VOCABULARY: PathBuf =
131// IN_REPO_DATA_FOLDER.join("keyword_vocabulary_safety_filtered_58641.txt");
132// static ref KEYWORDS_IDF: PathBuf =
133// IN_REPO_DATA_FOLDER.join("keyword_idf_safety_filtered_58641.npy");
134// }
135
136// const RESID_CSV: &str = "data/uniref90_and_mgnify90_residue_annotations_gt_1k_proteins.csv";
137
138// lazy_static! {
139// static ref INTERPRO2KEYWORDS: PathBuf =
140// IN_REPO_DATA_FOLDER.join("interpro_29026_to_keywords_58641.csv");
141// }