Uniform Manifold Approximation and Projection (UMAP) uses a nearest-neighbour graph to perform dimension reduction for visualization. I have recently been using UMAP to visualize the chemical space coverage of compounds. As an example, this piece of code computes a UMAP embedding for the SARS-CoV-2 database in ChEMBL, which consists of 6900 compounds (downloaded on November 28, 2020). Please modify the path to suit your own directory when using the same CSV file, which can be downloaded from https://www.ebi.ac.uk/chembl/g/#browse/compounds/filter/_metadata.compound_records.src_id%3A52.
#Import modules
import pandas as pd
import numpy as np
from tqdm import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import seaborn as sns
import umap
#Load the semicolon-delimited ChEMBL SARS-CoV-2 export (downloaded on Nov. 28, 2020)
#Source: https://www.ebi.ac.uk/chembl/g/#browse/compounds/filter/_metadata.compound_records.src_id%3A52
df = pd.read_csv(
    "./SARS_CoV_2_ChEMBL_Nov_28.csv",
    engine="python",
    sep=";",
)
#Bare expression so the notebook renders the loaded table
df
ChEMBL ID | Name | Synonyms | Type | Max Phase | Molecular Weight | Targets | Bioactivities | AlogP | PSA | ... | Structure Type | Inorganic Flag | Heavy Atoms | HBA Lipinski | HBD Lipinski | #RO5 Violations (Lipinski) | Molecular Weight (Monoisotopic) | Molecular Species | Molecular Formula | Smiles | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CHEMBL2087742 | NaN | NaN | Small molecule | 0 | 402.42 | 16 | 33 | 4.31 | 59.29 | ... | MOL | -1 | 29 | 5 | 1 | 0 | 402.1667 | NEUTRAL | C21H21F3N4O | Cc1c(F)cc(C(=O)NC2CC2)cc1-c1c(F)cn2c(C(C)(C)C)... |
1 | CHEMBL4075936 | NaN | NaN | Small molecule | 0 | 327.86 | 2 | 4 | 5.32 | 20.31 | ... | MOL | -1 | 23 | 2 | 0 | 1 | 327.1390 | NEUTRAL | C20H22ClNO | O=C(c1ccc(Cl)cc1)N(Cc1ccccc1)C1CCCCC1 |
2 | CHEMBL3348842 | NaN | NaN | Small molecule | 0 | 402.67 | 2 | 2 | 4.59 | 44.29 | ... | MOL | -1 | 29 | 3 | 3 | 0 | 402.3610 | BASE | C26H46N2O | CN[C@@H](C)[C@H]1[C@H](O)C[C@@]2(C)[C@@H]3CC[C... |
3 | CHEMBL1257423 | NaN | NaN | Small molecule | 0 | 418.38 | 9 | 14 | 3.51 | 96.37 | ... | MOL | -1 | 30 | 7 | 3 | 0 | 418.1253 | NEUTRAL | C20H17F3N4O3 | O=C(NCCO)c1cccc(-c2cc(Nc3ccc(OC(F)(F)F)cc3)ncn... |
4 | CHEMBL2182052 | NaN | NaN | Small molecule | 0 | 456.50 | 8 | 10 | 6.25 | 101.66 | ... | MOL | -1 | 34 | 7 | 2 | 1 | 456.1685 | ACID | C27H24N2O5 | Cc1noc(-c2ccc(-c3ccc(CC(=O)O)cc3)cc2)c1NC(=O)O... |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6895 | CHEMBL751 | BROMFENAC SODIUM | AHR-10282B|BROMDAY|BROMFENAC MONOSODIUM SALT|B... | Small molecule | 4 | 356.15 | 5 | 13 | 2.89 | 80.39 | ... | MOL | 0 | 20 | 4 | 3 | 0 | 333.0001 | ACID | C15H11BrNNaO3 | Nc1c(CC(=O)[O-])cccc1C(=O)c1ccc(Br)cc1.[Na+] |
6896 | CHEMBL1236682 | REFAMETINIB | BAY 869766|BAY 8697661|BAY-86-9766|BAY-869766|... | Small molecule | 2 | 572.34 | 282 | 285 | 3.48 | 107.89 | ... | MOL | 0 | 31 | 7 | 4 | 1 | 572.0090 | NEUTRAL | C19H20F3IN2O5S | COc1cc(F)c(F)c(Nc2ccc(I)cc2F)c1NS(=O)(=O)C1(C[... |
6897 | CHEMBL17350 | TRAXOPRODIL | CP-101,606|CP-101606|TRAXOPRODIL | Small molecule | 2 | 327.42 | 22 | 47 | 2.80 | 63.93 | ... | MOL | 0 | 24 | 4 | 3 | 0 | 327.1834 | BASE | C20H25NO3 | C[C@@H]([C@@H](O)c1ccc(O)cc1)N1CCC(O)(c2ccccc2... |
6898 | CHEMBL210651 | APRICITABINE | APRICITABINE | Small molecule | 3 | 229.26 | 5 | 65 | -0.59 | 90.37 | ... | MOL | 0 | 15 | 6 | 3 | 0 | 229.0521 | NEUTRAL | C8H11N3O3S | Nc1ccn([C@H]2CO[C@@H](CO)S2)c(=O)n1 |
6899 | CHEMBL572 | NITROFURANTOIN | BERKFURIN|CEDURAN|Dantafur|FURADANTIN|FURALAN|... | Small molecule | 4 | 238.16 | 207 | 3101 | 0.07 | 118.05 | ... | MOL | 0 | 17 | 9 | 1 | 0 | 238.0338 | NEUTRAL | C8H6N4O5 | O=C1CN(/N=C/c2ccc([N+](=O)[O-])o2)C(=O)N1 |
6900 rows × 31 columns
#Keep only the rows of df that have a SMILES string
#dropna() detects missing values directly instead of comparing str(x) with 'nan'
SARS_CoV_2_smiles_filter = df.Smiles.dropna().tolist()
#Define methods for fingerprints
#Computed using 2048 bits with radius 2
#Convert list of SMILES to list of fingerprints
def fingerprint_list_from_smiles_list(smiles_list, n_bits=2048):
    """Convert SMILES strings into fingerprint arrays, one per input string.

    Args:
        smiles_list: Iterable of SMILES strings.
        n_bits: Fingerprint length forwarded to ``fingerprint_as_array``.

    Returns:
        List of fingerprint arrays, in the same order as ``smiles_list``.

    Raises:
        ValueError: If RDKit cannot parse one of the SMILES strings.
    """
    fingerprint_list = []
    for smiles in tqdm(smiles_list):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            #MolFromSmiles returns None on a parse failure; report the offending
            #input here instead of failing later inside the fingerprint call
            raise ValueError("Could not parse SMILES: {!r}".format(smiles))
        fingerprint_list.append(fingerprint_as_array(mol, n_bits))
    return fingerprint_list
def fingerprint_as_array(mol, n_bits=2048):
    """Return the radius-2 Morgan bit-vector fingerprint of ``mol`` as a NumPy array.

    Args:
        mol: RDKit molecule to fingerprint.
        n_bits: Number of bits in the fingerprint.

    Returns:
        One-dimensional NumPy integer array filled from the fingerprint.
    """
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    #Builtin int replaces np.int, a deprecated alias that was removed in NumPy 1.24;
    #the buffer is sized to the fingerprint length
    array = np.zeros((n_bits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fingerprint, array)
    return array
#Fingerprint every SMILES string kept from the SARS_CoV_2 dataset
fingerprint_SARS_CoV_2_smiles_filter = fingerprint_list_from_smiles_list(
    smiles_list=SARS_CoV_2_smiles_filter,
)
100%|██████████| 6843/6843 [00:04<00:00, 1503.90it/s]
#Combine the per-molecule fingerprint arrays into one numpy array
fingerprint_array_SARS_CoV_2_smiles_filter = np.asarray(
    fingerprint_SARS_CoV_2_smiles_filter
)
#Compute UMAP on the SARS_CoV_2 database
#The estimator gets its own name: assigning it to `umap` would replace the imported
#module, so running this cell a second time would fail on `umap.UMAP`
umap_reducer = umap.UMAP()
umap_fingerprint_array_SARS_CoV_2_smiles_filter = umap_reducer.fit_transform(
    fingerprint_array_SARS_CoV_2_smiles_filter
)
#Wrap the UMAP output in a DataFrame with columns "X" and "Y" for plotting
umap_fingerprint_array_SARS_CoV_2_smiles_filter_fig = pd.DataFrame(
    umap_fingerprint_array_SARS_CoV_2_smiles_filter,
    columns=["X", "Y"],
)

#Configure seaborn: figure size, font scale, then the whitegrid style
sns.set(rc={"figure.figsize": (10, 10)})
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

#Draw the embedding as a hexbin joint plot
fig = sns.jointplot(
    x="X",
    y="Y",
    data=umap_fingerprint_array_SARS_CoV_2_smiles_filter_fig,
    kind="hex",
    color="blue",
    height=10,
    ratio=5,
)