Silly Walks is a metric that measures how "silly" (unusual) a molecule of interest is, on a scale from 0 (least silly molecule) to 1 (most silly molecule). The score is the fraction of the molecule's Morgan fingerprint (radius 2) features that never occur in a reference set of drug molecules from ChEMBL, as defined in Pat Walters' GitHub repository (https://github.com/PatWalters/silly_walks).
In this notebook, I show the distribution of Silly Walks scores for approved drugs from DrugBank (downloaded from drugbank.ca) to illustrate how the metric is used.
# From Pat Walters' GitHub repository (https://github.com/PatWalters/silly_walks/blob/main/silly_walks.py)
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
class SillyWalks:
    """Score how unusual a molecule's substructures are versus a reference set.

    The reference molecules are reduced to a dictionary of Morgan fingerprint
    (radius 2) feature ids and their total counts. A query molecule's score is
    the fraction of its own feature ids that are absent from that dictionary:
    0 means every feature was seen in the reference set, 1 means none was.
    """

    def __init__(self, df):
        """Build the reference feature counts.

        Args:
            df: Object exposing a ``SMILES`` attribute/column (read as
                ``df.SMILES``) that iterates over reference SMILES strings.
                Entries RDKit cannot parse are skipped.
        """
        # Feature id -> summed occurrence count across all reference molecules.
        self.count_dict = {}
        for smi in df.SMILES:
            mol = Chem.MolFromSmiles(smi)
            if not mol:
                continue
            fp = AllChem.GetMorganFingerprint(mol, 2)
            for k, v in fp.GetNonzeroElements().items():
                self.count_dict[k] = self.count_dict.get(k, 0) + v

    def score(self, smiles_in):
        """Return the fraction of the molecule's features unseen in the reference set.

        Args:
            smiles_in: SMILES string of the molecule to score.

        Returns:
            A value between 0 and 1. Returns 1 when the SMILES cannot be
            parsed, and also when the parsed molecule yields no fingerprint
            features at all (e.g. an empty molecule), since there is then
            nothing that matches the reference set.
        """
        mol = Chem.MolFromSmiles(smiles_in)
        if not mol:
            return 1
        fp = AllChem.GetMorganFingerprint(mol, 2)
        on_bits = fp.GetNonzeroElements().keys()
        # Guard against division by zero for molecules without any features.
        if not on_bits:
            return 1
        # Stored counts are never None, so "missing key" is exactly the
        # condition the reference lookup has to detect.
        silly_count = sum(1 for bit in on_bits if bit not in self.count_dict)
        return silly_count / len(on_bits)
# Load the ChEMBL reference molecules
# (downloaded from: https://github.com/PatWalters/silly_walks/blob/main/chembl_drugs.smi).
# The file is space-separated with no header row, so column names are supplied here.
ref_df = pd.read_csv(
    "./chembl_drugs.smi",
    sep=" ",
    names=["SMILES", "Name"],
)
ref_df
SMILES | Name | |
---|---|---|
0 | Br.CC(NC(C)(C)C)C(=O)c1cccc(Cl)c1 | 675686 |
1 | Br.Cc1ccc(Sc2ccccc2N3CCNCC3)c(C)c1 | 1379657 |
2 | Br.CN(C)CCCC1(OCc2cc(ccc12)C#N)c3ccc(F)cc3 | 674732 |
3 | Br.CN1CCC[C@@H]1Cc2c[nH]c3ccc(CCS(=O)(=O)c4ccc... | 674954 |
4 | Br.COc1ccc2CN(C)CC[C@@]34C=C[C@H](O)C[C@@H]3Oc... | 443255 |
... | ... | ... |
1490 | [O-][N+](=O)c1oc(\C=N\N2CCOC2=O)cc1 | 151310 |
1491 | [O-][N+](=O)O[C@H]1CO[C@@H]2[C@@H](CO[C@H]12)O... | 556 |
1492 | [O-][S+](CCC1C(=O)N(N(C1=O)c2ccccc2)c3ccccc3)c... | 60232 |
1493 | [OH-].[OH-].[Mg+2] | 674669 |
1494 | [OH-].[OH-].[OH-].[Al+3] | 674657 |
1495 rows × 2 columns
# Load the clinically approved drugs from DrugBank (downloaded from drugbank.ca).
# The CSV holds only the "SMILES" and "Name" columns, and only entries whose
# Drug Groups value is "approved" were kept.
DrugBank_approved_df = pd.read_csv(filepath_or_buffer="./DrugBank_approved_SMILES.csv")
DrugBank_approved_df
SMILES | Name | |
---|---|---|
0 | CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H... | Goserelin |
1 | CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(... | Gramicidin D |
2 | NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)... | Desmopressin |
3 | NCCCCC(NC(=O)C1CCCN1C(=O)C1CSSCC(N)C(=O)NC(CC2... | Vasopressin |
4 | [H][C@]12[C@H](C[C@H](O)C=C1C=C[C@H](C)[C@@H]2... | Pravastatin |
... | ... | ... |
1011 | [Fe+3].OC[C@H](O)[C@@H](O)[C@H](O)[C@H](O)CO[C... | Ferric derisomaltose |
1012 | [H][C@@]12C[C@@H]([18F])[C@H](O)[C@@]1(C)CC[C@... | Fluoroestradiol F-18 |
1013 | [64Cu++].C[C@@H](O)[C@H](NC(=O)[C@@H]1CSSC[C@H... | Copper oxodotreotide Cu-64 |
1014 | NCC1=CC(=CC=C1)N1N=C(C=C1C(=O)NC1=CC(=CC=C1F)[... | Berotralstat |
1015 | [H+].[68Ga+3].OC(=O)CC[C@H](NC(=O)N[C@@H](CCCC... | Gallium Ga-68 gozetotide |
1016 rows × 2 columns
# Score every approved DrugBank drug against the ChEMBL reference set and
# sort the table in place from the highest (silliest) score to the lowest.
silly_walks = SillyWalks(ref_df)
DrugBank_approved_df["Silly Walks Score"] = DrugBank_approved_df.SMILES.apply(silly_walks.score)
DrugBank_approved_df.sort_values(by="Silly Walks Score", ascending=False, inplace=True)
[14:35:28] Explicit valence for atom # 0 N, 4, is greater than permitted [14:35:28] Explicit valence for atom # 0 N, 4, is greater than permitted [14:35:29] Explicit valence for atom # 84 N, 4, is greater than permitted [14:35:29] WARNING: not removing hydrogen atom without neighbors
# Display the DrugBank table, now including the "Silly Walks Score" column.
DrugBank_approved_df
SMILES | Name | Silly Walks Score | |
---|---|---|---|
770 | [81Kr] | Krypton Kr 81m | 1.0 |
757 | [57Co+3].[C-]#N.C[C@H](CNC(=O)CC[C@]1(C)[C@@H]... | Cyanocobalamin Co-57 | 1.0 |
967 | [177Lu] | Lutetium Lu-177 | 1.0 |
819 | [O--].[Cu++] | Cupric oxide | 1.0 |
820 | [Cr] | Chromium | 1.0 |
... | ... | ... | ... |
565 | COC1=CC=C(C=C1)N1N=C(C(N)=O)C2=C1C(=O)N(CC2)C1... | Apixaban | 0.0 |
566 | COC1=CC(NC2=C(C=NC3=CC(OCCCN4CCN(C)CC4)=C(OC)C... | Bosutinib | 0.0 |
567 | NC1=CC=NC=C1 | Dalfampridine | 0.0 |
569 | NC(=O)C1=CC2=CC(=CC=C2O1)N1CCN(CCCCC2=CNC3=C2C... | Vilazodone | 0.0 |
508 | [H][C@@](C)(CCC(O)=O)[C@@]1([H])CC[C@@]2([H])[... | Cholic Acid | 0.0 |
1016 rows × 3 columns
# Plot the distribution of "Silly Walks Score" for the approved DrugBank drugs as a histogram
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style()
ax = sns.histplot(DrugBank_approved_df['Silly Walks Score'], kde=False, bins=9)
# Annotate every bar with its count, anchored at the horizontal centre of the
# bar and at the bar's top (the label text ends with a newline, presumably to
# push the number above the bar edge).
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre_x = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{bar_height:.0f}\n',
        (bar_centre_x, bar_height),
        ha='center',
        va='center',
        color='crimson',
    )
plt.show()
# Convert a list of SMILES into a list of RDKit mol objects for drawing sample molecules
def convert_smiles_into_mol(list_name):
    """Parse each SMILES string in ``list_name`` with ``Chem.MolFromSmiles``.

    Args:
        list_name: Iterable of SMILES strings.

    Returns:
        A new list with one parse result per input, in the same order.
        Whatever ``Chem.MolFromSmiles`` returns for an entry is kept as-is,
        so the output always has the same length as the input.
    """
    return [Chem.MolFromSmiles(smiles) for smiles in list_name]
# Keep only the drugs whose Silly Walks Score is exactly 0.0
DrugBank_approved_df1 = DrugBank_approved_df[DrugBank_approved_df['Silly Walks Score'].eq(0.0)]
DrugBank_approved_df1
SMILES | Name | Silly Walks Score | |
---|---|---|---|
169 | CNCCC=C1C2=CC=CC=C2CCC2=CC=CC=C12 | Nortriptyline | 0.0 |
168 | NC(N)=N | Guanidine | 0.0 |
781 | [H]\C(CN1CCCC1)=C(\C1=CC=C(C)C=C1)C1=CC=CC(=N1... | Acrivastine | 0.0 |
784 | N[13C](N)=O | Urea C-13 | 0.0 |
121 | CN1CCC(CC1)=C1C2=CC=CC=C2C=CC2=CC=CC=C12 | Cyproheptadine | 0.0 |
... | ... | ... | ... |
565 | COC1=CC=C(C=C1)N1N=C(C(N)=O)C2=C1C(=O)N(CC2)C1... | Apixaban | 0.0 |
566 | COC1=CC(NC2=C(C=NC3=CC(OCCCN4CCN(C)CC4)=C(OC)C... | Bosutinib | 0.0 |
567 | NC1=CC=NC=C1 | Dalfampridine | 0.0 |
569 | NC(=O)C1=CC2=CC(=CC=C2O1)N1CCN(CCCCC2=CNC3=C2C... | Vilazodone | 0.0 |
508 | [H][C@@](C)(CCC(O)=O)[C@@]1([H])CC[C@@]2([H])[... | Cholic Acid | 0.0 |
494 rows × 3 columns
# Build one dataframe per Silly Walks Score value: 0.1, 0.2, 0.3 and 0.5.
# NOTE(review): these are exact equality matches on a float column, so each
# subset holds only rows whose score equals that value exactly, not a range
# of scores -- confirm that is the intent.
DrugBank_approved_df2 = DrugBank_approved_df[DrugBank_approved_df['Silly Walks Score'].eq(0.1)]
DrugBank_approved_df3 = DrugBank_approved_df[DrugBank_approved_df['Silly Walks Score'].eq(0.2)]
DrugBank_approved_df4 = DrugBank_approved_df[DrugBank_approved_df['Silly Walks Score'].eq(0.3)]
DrugBank_approved_df5 = DrugBank_approved_df[DrugBank_approved_df['Silly Walks Score'].eq(0.5)]
from rdkit.Chem import Draw
# Draw the first n molecules of the specified dataframe to show examples
def draw_molecules(dataframe, number):
    """Render the first ``number`` molecules of ``dataframe`` as a grid image.

    Args:
        dataframe: Table with a ``SMILES`` column.
        number: How many leading rows to draw; used as the stop value of a
            ``[0:number]`` slice.

    Returns:
        The image object produced by ``Draw.MolsToGridImage`` (called with
        ``returnPNG=False``), with each molecule in a 200x200 cell.
    """
    # Slice before parsing so that only the molecules that are actually drawn
    # get converted, instead of every SMILES in the dataframe.
    smiles_subset = dataframe['SMILES'].values.tolist()[0:number]
    mol_list = convert_smiles_into_mol(smiles_subset)
    image_mol = Draw.MolsToGridImage(mol_list, subImgSize=(200, 200), returnPNG=False)
    return image_mol
# Draw the first 3 example molecules with Silly Walks Score == 0.0
draw_molecules(DrugBank_approved_df1, 3)
# Draw the first 3 example molecules with Silly Walks Score == 0.1
draw_molecules(DrugBank_approved_df2, 3)
# Draw the first 3 example molecules with Silly Walks Score == 0.2
draw_molecules(DrugBank_approved_df3, 3)
# Draw the first 2 example molecules with Silly Walks Score == 0.3
draw_molecules(DrugBank_approved_df4, 2)
# Draw the first 3 example molecules with Silly Walks Score == 0.5
draw_molecules(DrugBank_approved_df5, 3)
Conclusion: As the histogram of Silly Walks scores for the approved drugs from DrugBank shows, approximately 90% of the drugs (912 of the 1016 approved drugs) have a Silly Walks score between 0.0 and 0.3.