Objective :
The aim of this post is to create a scheduled job that runs nightly against the ChEMBL database and generates useful reports and graphs that researchers can use for further detailed analysis and for the discovery of new drugs.
Architecture :
This job is built on the Celery framework (https://docs.celeryproject.org/), running on top of RabbitMQ for scalability. When triggered each night, it picks up new compounds or target proteins added to the ChEMBL database and generates useful reports, which are then uploaded to an S3 bucket for clinicians and researchers to pore over.
Architecture Diagram TBD
Code Structure :
Workflow file
Create a file called process.py with the following contents
import os

# Run the five phases sequentially; each bookN.py script is described in the sections below
for phase in range(1, 6):
    print(f'------------------------------Starting phase {phase} execution ---------------------------------------')
    os.system(f'python3 /root/biopython/dataprofessor-scripts/book{phase}.py')
    print(f'------------------------------Completed phase {phase} execution ---------------------------------------')
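Note that os.system does not stop the workflow when a phase fails. A stricter runner, sketched below under the same assumed script locations, uses subprocess.run with check=True so that a failing phase aborts the night's run:

import subprocess

SCRIPTS_DIR = '/root/biopython/dataprofessor-scripts'

for phase in range(1, 6):
    print(f'Starting phase {phase}')
    # check=True raises CalledProcessError if the script exits with a non-zero status
    subprocess.run(['python3', f'{SCRIPTS_DIR}/book{phase}.py'], check=True)
    print(f'Completed phase {phase}')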
Script to perform Data Collection and Pre-Processing from the ChEMBL Database – (book1/process1)
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
print(f'targets : {targets}')
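# Select the fifth entry in the search results as the target of interest
# (note: result ordering may change between ChEMBL releases)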
selected_target = targets.target_chembl_id[4]
print(f'selected_target : {selected_target}')
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
print(df.head(3))
df.to_csv('bioactivity_data_raw.csv', index=False)
df2 = df[df.standard_value.notna()]
print(f'df2 : {df2}')
bioactivity_class = []
for i in df2.standard_value:
    if float(i) >= 10000:
        bioactivity_class.append("inactive")
    elif float(i) <= 1000:
        bioactivity_class.append("active")
    else:
        # Keep the intermediate class: book2 filters it out later, and without this
        # branch the list would be shorter than df2, misaligning the concat below
        bioactivity_class.append("intermediate")
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2[selection].reset_index(drop=True)  # reset the index so the concat below aligns row by row
print(f'df3 : {df3}')
bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
print(f'df4 : {df4}')
df4.to_csv('bioactivity_data_preprocessed.csv', index=False)
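Incidentally, books 3 to 5 below work with an acetylcholinesterase dataset; this collection script works for any target by changing the search string, for example:

# Hypothetical: collect bioactivity data for a different target protein
target_query = target.search('acetylcholinesterase')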
Script for performing Descriptor Calculation and Exploratory Data Analysis – (book2/process2)
import pandas as pd
import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

df = pd.read_csv('bioactivity_data_preprocessed.csv')
# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation
def lipinski(smiles, verbose=False):
    # Compute the four Lipinski rule-of-five descriptors for each SMILES string
    moldata = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        moldata.append(mol)

    baseData = np.arange(1, 1)
    i = 0
    for mol in moldata:
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)

        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])

        if i == 0:
            baseData = row
        else:
            baseData = np.vstack([baseData, row])
        i = i + 1

    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)
    return descriptors
df_lipinski = lipinski(df.canonical_smiles)
print(f'df_lipinski : {df_lipinski}')
print(f'df : {df}')
df_combined = pd.concat([df,df_lipinski], axis=1)
print(f'df_combined : {df_combined}')
# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb
def pIC50(input):
    # Convert the capped standard_value (nM) into pIC50 = -log10(molar IC50)
    pIC50 = []
    for i in input['standard_value_norm']:
        molar = i * (10**-9)  # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis=1)  # keyword axis; the positional form was removed in pandas 2.x
    return x
print(df_combined.standard_value.describe())
# Rationale for the cap in norm_value below: -log10(1e-9 * 1e8) = 1, while
# -log10(1e-9 * 1e10) = -1, so capping standard_value at 100,000,000 keeps pIC50 >= 1
def norm_value(input):
    # Cap standard_value at 100,000,000 nM so the derived pIC50 stays non-negative
    norm = []
    for i in input['standard_value']:
        if i > 100000000:
            i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)
    return x
df_norm = norm_value(df_combined)
print(df_norm)
print(df_norm.standard_value_norm.describe())
df_final = pIC50(df_norm)
print(f'df_final : {df_final}')
print(df_final.pIC50.describe())
df_2class = df_final[df_final.bioactivity_class != 'intermediate']
print(f'df_2class : {df_2class}')
import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt
plt.figure(figsize=(5.5, 5.5))
sns.countplot(x='bioactivity_class', data=df_2class, edgecolor='black')
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('Frequency', fontsize=14, fontweight='bold')
plt.savefig('plot_bioactivity_class.pdf')
plt.figure(figsize=(5.5, 5.5))
sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)
plt.xlabel('MW', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('plot_MW_vs_LogP.pdf')
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'pIC50', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')
plt.savefig('plot_ic50.pdf')
def mannwhitney(descriptor, verbose=False):
    # https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
    from numpy.random import seed
    from scipy.stats import mannwhitneyu

    # seed the random number generator
    seed(1)

    # actives and inactives
    selection = [descriptor, 'bioactivity_class']
    df = df_2class[selection]
    active = df[df.bioactivity_class == 'active'][descriptor]
    inactive = df[df.bioactivity_class == 'inactive'][descriptor]

    # compare samples
    stat, p = mannwhitneyu(active, inactive)

    # interpret
    alpha = 0.05
    if p > alpha:
        interpretation = 'Same distribution (fail to reject H0)'
    else:
        interpretation = 'Different distribution (reject H0)'

    results = pd.DataFrame({'Descriptor': descriptor,
                            'Statistics': stat,
                            'p': p,
                            'alpha': alpha,
                            'Interpretation': interpretation}, index=[0])
    filename = 'mannwhitneyu_' + descriptor + '.csv'
    results.to_csv(filename)
    return results
print(f"mannwhitney('pIC50') : {mannwhitney('pIC50')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'MW', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('MW', fontsize=14, fontweight='bold')
plt.savefig('plot_MW.pdf')
print(f"mannwhitney('MW') : {mannwhitney('MW')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'LogP', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
plt.savefig('plot_LogP.pdf')
print(f"mannwhitney('LogP') : {mannwhitney('LogP')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'NumHDonors', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHDonors', fontsize=14, fontweight='bold')
plt.savefig('plot_NumHDonors.pdf')
print(f"mannwhitney('NumHDonors') : {mannwhitney('NumHDonors')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'NumHAcceptors', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHAcceptors', fontsize=14, fontweight='bold')
plt.savefig('plot_NumHAcceptors.pdf')
print(f"mannwhitney('NumHAcceptors') : {mannwhitney('NumHAcceptors')}")
os.system("zip -r results.zip . -i *.csv *.pdf")
Script for calculating molecular descriptors, which are quantitative descriptions of the compounds in the dataset, and for preparing the dataset for subsequent model building – (book3/process3)
import pandas as pd
import os
import subprocess
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip")
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh")
os.system("unzip padel.zip")
os.system("wget https://raw.githubusercontent.com/dataprofessor/data/master/acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv")
df3 = pd.read_csv('acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv')
print(f'df3 : {df3}')
selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df3[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)
print("cat molecule.smi | head -5 : ")
cmd = 'cat molecule.smi'
cmdnext = 'head -5'
p1 = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
p2 = subprocess.Popen(cmdnext,shell=True, stdin=p1.stdout, stdout=subprocess.PIPE)
output = p2.communicate()[0]
print(output.decode())
print("cat molecule.smi | wc -l : ")
os.system("cat molecule.smi | wc -l")
print("cat padel.sh : ")
os.system("cat padel.sh")
print("bash padel.sh : ")
os.system("bash padel.sh")
print("ls -l : ")
os.system("ls -l")
df3_X = pd.read_csv('descriptors_output.csv')
print(f'df3_X : {df3_X}')
df3_X = df3_X.drop(columns=['Name'])
print(f'df3_X : {df3_X}')
df3_Y = df3['pIC50']
print(f'df3_Y : {df3_Y}')
dataset3 = pd.concat([df3_X,df3_Y], axis=1)
print(f'dataset3 : {dataset3}')
dataset3.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)
Script for building a regression model of acetylcholinesterase or coronavirus inhibitors (or any protein's, for that matter) using the random forest algorithm – (book4/process4)
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os
os.system("wget https://github.com/dataprofessor/data/raw/master/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv")
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')
X = df.drop('pIC50', axis=1)
print(f'X : {X}')
Y = df.pIC50
print(f'Y : {Y}')
print(f'X.shape : {X.shape}')
print(f'Y.shape : {Y.shape}')
from sklearn.feature_selection import VarianceThreshold
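# Drop near-constant fingerprint bits: for a binary feature, variance p*(1-p)
# below .8*(1-.8) means the same value occurs in more than 80% of samples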
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)
print(f'X.shape : {X.shape}')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
print(f'r2 : {r2}')
Y_pred = model.predict(X_test)
print(f'Y_pred : {Y_pred}')
import matplotlib.pyplot as plt
sns.set(color_codes=True)
sns.set_style("white")
ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha': 0.4})  # seaborn >= 0.12 requires keyword arguments here
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)
# plt.show() is unnecessary in a batch job; save the figure directly
plt.savefig('pIC50_graph.pdf')
Script for comparing several ML algorithms for building regression models of acetylcholinesterase inhibitors – (book5/process5)
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')
X = df.drop('pIC50', axis=1)
Y = df.pIC50
# Examine X dimension
print(f'X.shape {X.shape}')
# Remove low variance features
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)
print(f'X.shape {X.shape}')
# Perform data splitting using 80/20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Define and build the LazyRegressor; its fit() takes (X_train, X_test, y_train, y_test),
# so passing the training set twice below scores the models on the training data
clf = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models_train,predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test,predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)
# Performance table of the training set (80% subset)
print(predictions_train)
# Performance table of the test set (20% subset)
print(predictions_test)
# Bar plot of R-squared values
import matplotlib.pyplot as plt
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="R-Squared", data=predictions_train)
ax.set(xlim=(0, 1))
plt.savefig('rsquared.pdf')
# Bar plot of RMSE values
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="RMSE", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('rmse.pdf')
# Bar plot of calculation time
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="Time Taken", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('bar_calculation_time.pdf')
High Level code structure

Run the workflow with the command
~/biopython/dataprofessor-scripts/output# python3 ../process.py > output.log
Observed output :


Time taken to complete the script end to end : 11 mins
List of files generated as output :

Build a Celery-based framework for processing the 5 tasks/jobs detailed in the previous sections.
Celery is a distributed parallel-processing framework that runs on top of RabbitMQ or Redis infrastructure for executing tasks. More information on the framework can be found at https://docs.celeryproject.org/en/stable/userguide/index.html
In order to set up the infrastructure for running Celery, spin up RabbitMQ and Redis containers using Docker:
docker run -d -p 5672:5672 rabbitmq
docker run -d -p 6379:6379 redis

Next, create a project in PyCharm with a file called tasks.py. This file sets up the RabbitMQ broker required by Celery and a Redis results backend.
from celery import Celery
app = Celery('tasks', broker='pyamqp://guest@localhost//', backend='redis://')
# task definitions go here; see the full tasks.py after the group and chord examples below
Use the following command to start the Celery worker:
celery -A tasks worker --loglevel=INFO

Make the 5 process scripts detailed above object-oriented
For this, all we shall be doing is modifying each of the five .py files above to convert them into a class with an __init__ method, as shown in the sketch below. Feel free to split the code into separate functions so that their complexity is reduced. Into the body of each class, simply copy over the code from the corresponding process above.

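Since the diagram is not included here, below is a minimal sketch of what each converted class might look like, assuming the Book1 / execute_process1 naming that the tasks.py further down expects:

class Book1:
    def __init__(self):
        # shared setup (paths, ChEMBL client, etc.) can go here
        pass

    @staticmethod
    def execute_process1():
        # copy the body of book1 (data collection and pre-processing) here,
        # optionally split into smaller helper methods
        ...
        return 'bioactivity_data_preprocessed.csv'

A staticmethod is used so that tasks.py can call Book1.execute_process1() without instantiating the class.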
Create a Celery chain to execute the 5 processes/tasks/books/classes
Create a chain.py file with the following contents.
from celery import chain
from tasks import process1, process2, process3, process4, process5

# The tasks take no arguments and pass data between phases via files on disk,
# so immutable signatures (.si()) are used; with .s(), Celery would feed each
# task's return value into the next task, which these tasks do not accept.
result = chain(process1.si(), process2.si(), process3.si(), process4.si(), process5.si()).apply_async()
print(result.ready())       # False until the chain finishes
print(result.get())         # blocks until the final task completes
print(result.successful())
The above five processes are unfortunately not parallelizable, since each depends on the output of the previous one. But if they were independent, a more scalable option would be to use a group or a chord, as shown in the scripts below.
from celery import group
from tasks import process1, process2, process3, process4, process5
job = group([process1.s(), process2.s(), process3.s(), process4.s(), process5.s(),])
result = job.apply_async()
print(result.ready())
print(result.successful())
print(result.get())
from celery import chord
from tasks import process1, process2, process3, process4, process5, processes

# The chord callback runs once every task in the header has finished and
# receives the list of their results as its argument
header = [process1.s(), process2.s(), process3.s(), process4.s(), process5.s()]
callback = processes.s()
result = chord(header)(callback)
print(result.get())
Finally, our tasks.py file looks like this:
from celery import Celery, chain
from celery.schedules import crontab
from book1 import Book1
from book2 import Book2
from book3 import Book3
from book4 import Book4
from book5 import Book5

app = Celery('tasks', broker='pyamqp://guest@localhost//', backend='redis://')

@app.task
def process1():
    return Book1.execute_process1()

@app.task
def process2():
    return Book2.execute_process2()

@app.task
def process3():
    return Book3.execute_process3()

@app.task
def process4():
    return Book4.execute_process4()

@app.task
def process5():
    return Book5.execute_process5()

@app.task
def processes(results):
    # chord callback: receives the list of results from the header tasks
    return results

@app.task
def job():
    # Nightly entry point referenced by the beat schedule below: kick off the
    # chain of five processes and return the chain's task id
    workflow = chain(process1.si(), process2.si(), process3.si(),
                     process4.si(), process5.si())
    return workflow.apply_async().id

app.conf.beat_schedule = {
    "nightly-task": {
        "task": "tasks.job",
        # crontab(hour=0, minute=0) fires once at midnight; crontab(hour=0)
        # alone would fire every minute of that hour
        "schedule": crontab(hour=0, minute=0)
    }
}
Start Celery beat (alongside the worker) using the command
celery -A tasks beat --loglevel=INFO
Conclusion:
We haven't concluded yet; the remaining work is:
- Upload the PDF files and reports to an S3 bucket in the cloud.
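A minimal sketch of that upload step using boto3, assuming AWS credentials are already configured in the environment; the bucket name is hypothetical:

import boto3
from pathlib import Path

BUCKET = 'chembl-nightly-reports'  # hypothetical bucket name

s3 = boto3.client('s3')
for path in list(Path('.').glob('*.pdf')) + list(Path('.').glob('*.csv')):
    # Store each report under a reports/ prefix in the bucket
    s3.upload_file(str(path), BUCKET, f'reports/{path.name}')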