ML-based Scalable Bioinformatics Cron Job for Computational Drug Discovery

Objective :

The aim of this post is to create a scheduled job that runs nightly against the ChEMBL database, processes the (potentially large) bioactivity data, and generates reports and graphs that researchers can use for further detailed analysis and the discovery of new drugs.

Architecture :

This job is built on the Celery framework (https://docs.celeryproject.org/), running on top of RabbitMQ for scalability. When triggered nightly, it picks up new compounds or target proteins added to the ChEMBL database and generates reports, which are then uploaded to an S3 bucket for clinicians and researchers to pore over.

Architecture Diagram TBD
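As a rough illustration of the intended wiring (the actual Celery/RabbitMQ integration is still pending, see the Conclusion), a nightly trigger could look like the sketch below. The module name, broker URL, schedule, and the path to process.py are assumptions for illustration only.

# tasks.py -- illustrative sketch only; module name, broker URL and schedule are assumptions
from celery import Celery
from celery.schedules import crontab
import subprocess

app = Celery('drug_discovery', broker='amqp://guest:guest@localhost:5672//')

@app.task
def run_pipeline():
    # Drives the same five-phase workflow as process.py below
    subprocess.run(['python3', '/root/biopython/dataprofessor-scripts/process.py'], check=True)

app.conf.beat_schedule = {
    'nightly-drug-discovery-run': {
        'task': 'tasks.run_pipeline',
        'schedule': crontab(hour=1, minute=0),  # every night at 01:00
    },
}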

Code Structure :

Workflow file

Create a file called process.py with the following contents

import os

# Each phase runs one script of the pipeline: data collection and pre-processing,
# descriptor calculation and EDA, fingerprint generation, model building, and
# model comparison.
SCRIPTS_DIR = '/root/biopython/dataprofessor-scripts'

for phase in range(1, 6):
    print(f'------------------------------Starting phase {phase} execution ---------------------------------------')
    os.system(f'python3 {SCRIPTS_DIR}/book{phase}.py')
    print(f'------------------------------Completed phase {phase} execution ---------------------------------------')

Script to perform Data Collection and Pre-Processing from the ChEMBL Database

# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)

print(f'targets : {targets}')


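# Select one target from the search results by its positional index
# (hard-coded here; adjust the index for the target of interest)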
selected_target = targets.target_chembl_id[4]

print(f'selected_target : {selected_target}')


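# Retrieve the bioactivity records reported as IC50 for the selected target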
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
print(f'df.head(3) : {df.head(3)}')
df.to_csv('bioactivity_data_raw.csv', index=False)

df2 = df[df.standard_value.notna()]

print(f'df2 : {df2}')


# Label compounds by potency: IC50 >= 10,000 nM -> inactive, <= 1,000 nM -> active,
# anything in between -> intermediate (filtered out later before plotting).
bioactivity_class = []
for i in df2.standard_value:
  if float(i) >= 10000:
    bioactivity_class.append("inactive")
  elif float(i) <= 1000:
    bioactivity_class.append("active")
  else:
    bioactivity_class.append("intermediate")
  
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2[selection]

print(f'df3 : {df3}')


bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')
# Reset the index so the class labels line up row by row with the selected columns
df4 = pd.concat([df3.reset_index(drop=True), bioactivity_class], axis=1)

print(f'df4 : {df4}')


df4.to_csv('bioactivity_data_preprocessed.csv', index=False)  

Script for performing Descriptor Calculation and Exploratory Data Analysis.

import pandas as pd
import os

df = pd.read_csv('bioactivity_data_preprocessed.csv')

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation

def lipinski(smiles, verbose=False):
    """Compute the four Lipinski rule-of-five descriptors (MW, LogP,
    H-bond donors, H-bond acceptors) for a list of SMILES strings."""

    moldata = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        moldata.append(mol)

    rows = []
    for mol in moldata:
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)

        rows.append([desc_MolWt,
                     desc_MolLogP,
                     desc_NumHDonors,
                     desc_NumHAcceptors])

    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]
    descriptors = pd.DataFrame(data=rows, columns=columnNames)

    return descriptors
	
df_lipinski = lipinski(df.canonical_smiles)

print(f'df_lipinski : {df_lipinski}')

print(f'df : {df}')

df_combined = pd.concat([df,df_lipinski], axis=1)

print(f'df_combined : {df_combined}')

# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb

import numpy as np

def pIC50(input):
    """Convert the capped IC50 values (nM) to the negative log molar scale."""
    pIC50 = []

    for i in input['standard_value_norm']:
        molar = i * (10**-9)  # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis=1)

    return x

print(f'df_combined.standard_value.describe() : {df_combined.standard_value.describe()}')

# Sanity check for the cap applied in norm_value below:
# -np.log10((10**-9) * 100000000)   = 1   (capped value -> lowest possible pIC50 is 1)
# -np.log10((10**-9) * 10000000000) = -1  (an uncapped large IC50 would give a negative pIC50)

def norm_value(input):
    """Cap standard_value at 100,000,000 nM so that extreme IC50 values
    do not produce negative pIC50 values after the log transform."""
    norm = []

    for i in input['standard_value']:
        if i > 100000000:
            i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)

    return x

df_norm = norm_value(df_combined)
print(df_norm)

print(f'df_norm.standard_value_norm.describe() : {df_norm.standard_value_norm.describe()}')

df_final = pIC50(df_norm)
print(f'df_final : {df_final}')

print(f'df_final.pIC50.describe() : {df_final.pIC50.describe()}')

df_2class = df_final[df_final.bioactivity_class != 'intermediate']
print(f'df_2class : {df_2class}')

import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt

plt.figure(figsize=(5.5, 5.5))

sns.countplot(x='bioactivity_class', data=df_2class, edgecolor='black')

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('Frequency', fontsize=14, fontweight='bold')

plt.savefig('plot_bioactivity_class.pdf')

plt.figure(figsize=(5.5, 5.5))

sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)

plt.xlabel('MW', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('plot_MW_vs_LogP.pdf')

plt.figure(figsize=(5.5, 5.5))

sns.boxplot(x = 'bioactivity_class', y = 'pIC50', data = df_2class)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')

plt.savefig('plot_ic50.pdf')

def mannwhitney(descriptor, verbose=False):
  # https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
  from numpy.random import seed
  from scipy.stats import mannwhitneyu

  # seed the random number generator
  seed(1)

  # actives and inactives
  selection = [descriptor, 'bioactivity_class']
  df = df_2class[selection]
  active = df[df.bioactivity_class == 'active'][descriptor]
  inactive = df[df.bioactivity_class == 'inactive'][descriptor]

  # compare samples
  stat, p = mannwhitneyu(active, inactive)
  #print('Statistics=%.3f, p=%.3f' % (stat, p))

  # interpret
  alpha = 0.05
  if p > alpha:
    interpretation = 'Same distribution (fail to reject H0)'
  else:
    interpretation = 'Different distribution (reject H0)'

  results = pd.DataFrame({'Descriptor': descriptor,
                          'Statistics': stat,
                          'p': p,
                          'alpha': alpha,
                          'Interpretation': interpretation}, index=[0])
  filename = 'mannwhitneyu_' + descriptor + '.csv'
  results.to_csv(filename)

  return results
  
print(f"mannwhitney('pIC50') : {mannwhitney('pIC50')}")

plt.figure(figsize=(5.5, 5.5))

sns.boxplot(x = 'bioactivity_class', y = 'MW', data = df_2class)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('MW', fontsize=14, fontweight='bold')

plt.savefig('plot_MW.pdf')

print(f"mannwhitney('MW') : {mannwhitney('MW')}")


plt.figure(figsize=(5.5, 5.5))

sns.boxplot(x = 'bioactivity_class', y = 'LogP', data = df_2class)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')

plt.savefig('plot_LogP.pdf')


print(f"mannwhitney('LogP') : {mannwhitney('LogP')}")

plt.figure(figsize=(5.5, 5.5))

sns.boxplot(x = 'bioactivity_class', y = 'NumHDonors', data = df_2class)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHDonors', fontsize=14, fontweight='bold')

plt.savefig('plot_NumHDonors.pdf')


print(f"mannwhitney('NumHDonors') : {mannwhitney('NumHDonors')}")

plt.figure(figsize=(5.5, 5.5))

sns.boxplot(x = 'bioactivity_class', y = 'NumHAcceptors', data = df_2class)

plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHAcceptors', fontsize=14, fontweight='bold')

plt.savefig('plot_NumHAcceptors.pdf')


print(f"mannwhitney('NumHAcceptors') : {mannwhitney('NumHAcceptors')}")

os.system("zip -r results.zip . -i *.csv *.pdf")

Script for calculating molecular descriptors (quantitative descriptions of the compounds in the dataset) and preparing the dataset for subsequent model building

import pandas as pd
import os
import subprocess
import sys


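# Download the PaDEL-Descriptor package and its driver script, plus an example
# acetylcholinesterase bioactivity dataset (with pIC50 values) from the dataprofessor repos.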
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip")
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh")
os.system("unzip padel.zip")
os.system("wget https://raw.githubusercontent.com/dataprofessor/data/master/acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv")

df3 = pd.read_csv('acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv')
print(f'df3 : {df3}')

selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df3[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)

print("cat molecule.smi | head -5 : ")
cmd = 'cat molecule.smi'
cmdnext = 'head -5' 
p1 = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
p2 = subprocess.Popen(cmdnext,shell=True, stdin=p1.stdout, stdout=subprocess.PIPE)
output = p2.communicate()[0]
print(output)

print("cat molecule.smi | wc -l : ")
os.system("cat molecule.smi | wc -l")

print("cat padel.sh : ")
os.system("cat padel.sh")

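# padel.sh runs the Java-based PaDEL-Descriptor tool over molecule.smi to compute
# molecular (PubChem) fingerprints, writing descriptors_output.csv (read back below).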
print("bash padel.sh : ")
os.system("bash padel.sh")

print("ls -l : ")
os.system("ls -l")

df3_X = pd.read_csv('descriptors_output.csv')

print(f'df3_X : {df3_X}')

df3_X = df3_X.drop(columns=['Name'])

print(f'df3_X : {df3_X}')

df3_Y = df3['pIC50']
print(f'df3_Y : {df3_Y}')

dataset3 = pd.concat([df3_X,df3_Y], axis=1)
print(f'dataset3 : {dataset3}')

dataset3.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)

Script for building a regression model of acetylcholinesterase or coronavirus inhibitors (or inhibitors of any other protein, for that matter) using the random forest algorithm.

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os

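# Download a prebuilt copy of the PubChem-fingerprint dataset
# (the same file the previous script writes locally).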
os.system("wget https://github.com/dataprofessor/data/raw/master/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv")

df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

X = df.drop('pIC50', axis=1)
print(f'X : {X}')

Y = df.pIC50
print(f'Y : {Y}')

print(f'X.shape : {X.shape}')

print(f'Y.shape : {Y.shape}')

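# Remove low-variance fingerprint bits (features that are almost constant across compounds)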
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))    
X = selection.fit_transform(X)

print(f'X.shape : {X.shape}')

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

print(X_train.shape, Y_train.shape)

print(X_test.shape, Y_test.shape)

model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
print(f'r2 : {r2}')

Y_pred = model.predict(X_test)
print(f'Y_pred : {Y_pred}')

import seaborn as sns
import matplotlib.pyplot as plt

sns.set(color_codes=True)
sns.set_style("white")

ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha': 0.4})
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)
plt.savefig('pIC50_graph.pdf')

Script for comparing several ML algorithms for building regression models of acetylcholinesterase inhibitors.

import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor

df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')

X = df.drop('pIC50', axis=1)
Y = df.pIC50

# Examine X dimension
print(f'X.shape {X.shape}')

# Remove low variance features
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))    
X = selection.fit_transform(X)
print(f'X.shape {X.shape}')

# Perform data splitting using 80/20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Define and build the LazyRegressor. Its fit() takes train and test splits and
# reports metrics on the second split, so passing the training split twice below
# yields the training-set performance table.
clf = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
models_train, predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test, predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)

# Performance table of the training set (80% subset)
print(predictions_train)

# Performance table of the test set (20% subset)
print(predictions_test)

# Bar plot of R-squared values
import matplotlib.pyplot as plt
import seaborn as sns


plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="R-Squared", data=predictions_train)
ax.set(xlim=(0, 1))
plt.savefig('rsquared.pdf')

# Bar plot of RMSE values
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="RMSE", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('rmse.pdf')

# Bar plot of calculation time
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="Time Taken", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('bar_calculation_time.pdf')

Running the Workflow :

Run the workflow with the command

~/biopython/dataprofessor-scripts/output# python3 ../process.py > output.log
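Until the Celery scheduling from the Conclusion is in place, the same workflow could be scheduled nightly with a plain cron entry; the line below is an illustrative assumption based on the command above (runs at 01:00 and logs to output.log):

# crontab entry (illustrative; paths assumed from the command above)
0 1 * * * cd /root/biopython/dataprofessor-scripts/output && python3 ../process.py > output.log 2>&1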

Observed output :

Time taken to complete the script end to end : 11 mins

List of files generated as output :

Conclusion:

We haven't concluded yet; this is just the beginning. The following tasks are still pending:

  1. Clean up all the scripts and make them object oriented.
  2. Build out the Celery framework with RabbitMQ and trigger the tasks from the Celery scheduler.
  3. Upload the PDF files and reports to an S3 bucket in the cloud (see the sketch after this list).
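
A minimal sketch of the pending S3 upload step, assuming boto3 and a hypothetical bucket name (credentials would come from the environment or an IAM role):

# upload_results.py -- illustrative sketch; bucket name and key prefix are assumptions
import glob
import boto3

s3 = boto3.client('s3')
BUCKET = 'drug-discovery-reports'   # hypothetical bucket name
PREFIX = 'nightly-run'

# Push every generated report (CSV and PDF) to S3 for researchers to review
for path in glob.glob('*.csv') + glob.glob('*.pdf'):
    s3.upload_file(path, BUCKET, f'{PREFIX}/{path}')
    print(f'Uploaded {path} to s3://{BUCKET}/{PREFIX}/{path}')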
