Objective :
The aim of this post is to create a scheduled job that runs nightly against the ChEMBL database and generates useful reports and graphs that researchers can use for further detailed analysis and for the discovery of new drugs.
Architecture :
This job is built on the Celery framework (https://docs.celeryproject.org/), running on top of RabbitMQ for scalability. When triggered each night, it picks up new compounds or target proteins added to the ChEMBL database and generates useful reports, which are then uploaded to an S3 bucket for clinicians and researchers to pore over.
Architecture Diagram TBD
Code Structure :
Workflow file
Create a file called process.py with the following contents
import os

# Run the five phases sequentially; each bookN.py script is described in the sections below
for phase in range(1, 6):
    print(f'------------------------------Starting phase {phase} execution ---------------------------------------')
    os.system(f'python3 /root/biopython/dataprofessor-scripts/book{phase}.py')
    print(f'------------------------------Completed phase {phase} execution ---------------------------------------')
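Note that os.system does not stop the workflow when a phase fails. A stricter runner, sketched below under the same assumed script locations, uses subprocess.run with check=True so that a failing phase aborts the night's run:

import subprocess

SCRIPTS_DIR = '/root/biopython/dataprofessor-scripts'

for phase in range(1, 6):
    print(f'Starting phase {phase}')
    # check=True raises CalledProcessError if the script exits with a non-zero status
    subprocess.run(['python3', f'{SCRIPTS_DIR}/book{phase}.py'], check=True)
    print(f'Completed phase {phase}')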
Script to perform Data Collection and Pre-Processing from the ChEMBL Database – (book1/process1)
# Import necessary libraries
import pandas as pd
from chembl_webresource_client.new_client import new_client
# Target search for coronavirus
target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
print(f'targets : {targets}')
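# Select the fifth entry in the search results as the target of interest
# (note: result ordering may change between ChEMBL releases)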
selected_target = targets.target_chembl_id[4]
print(f'selected_target : {selected_target}')
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)
print(df.head(3))
df.to_csv('bioactivity_data_raw.csv', index=False)
df2 = df[df.standard_value.notna()]
print(f'df2 : {df2}')
bioactivity_class = []
for i in df2.standard_value:
    if float(i) >= 10000:
        bioactivity_class.append("inactive")
    elif float(i) <= 1000:
        bioactivity_class.append("active")
    else:
        # Keep the intermediate class: book2 filters it out later, and without this
        # branch the list would be shorter than df2, misaligning the concat below
        bioactivity_class.append("intermediate")
selection = ['molecule_chembl_id','canonical_smiles','standard_value']
df3 = df2[selection].reset_index(drop=True)  # reset the index so the concat below aligns row by row
print(f'df3 : {df3}')
bioactivity_class = pd.Series(bioactivity_class, name='bioactivity_class')
df4 = pd.concat([df3, bioactivity_class], axis=1)
print(f'df4 : {df4}')
df4.to_csv('bioactivity_data_preprocessed.csv', index=False)
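Incidentally, books 3 to 5 below work with an acetylcholinesterase dataset; this collection script works for any target by changing the search string, for example:

# Hypothetical: collect bioactivity data for a different target protein
target_query = target.search('acetylcholinesterase')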
Script for performing Descriptor Calculation and Exploratory Data Analysis – (book2/process2)
import pandas as pd
import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

df = pd.read_csv('bioactivity_data_preprocessed.csv')
# Inspired by: https://codeocean.com/explore/capsules?query=tag:data-curation
def lipinski(smiles, verbose=False):
    # Compute the four Lipinski rule-of-five descriptors for each SMILES string
    moldata = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        moldata.append(mol)

    baseData = np.arange(1, 1)
    i = 0
    for mol in moldata:
        desc_MolWt = Descriptors.MolWt(mol)
        desc_MolLogP = Descriptors.MolLogP(mol)
        desc_NumHDonors = Lipinski.NumHDonors(mol)
        desc_NumHAcceptors = Lipinski.NumHAcceptors(mol)

        row = np.array([desc_MolWt,
                        desc_MolLogP,
                        desc_NumHDonors,
                        desc_NumHAcceptors])

        if i == 0:
            baseData = row
        else:
            baseData = np.vstack([baseData, row])
        i = i + 1

    columnNames = ["MW", "LogP", "NumHDonors", "NumHAcceptors"]
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)
    return descriptors
df_lipinski = lipinski(df.canonical_smiles)
print(f'df_lipinski : {df_lipinski}')
print(f'df : {df}')
df_combined = pd.concat([df,df_lipinski], axis=1)
print(f'df_combined : {df_combined}')
# https://github.com/chaninlab/estrogen-receptor-alpha-qsar/blob/master/02_ER_alpha_RO5.ipynb
def pIC50(input):
    # Convert the capped standard_value (nM) into pIC50 = -log10(molar IC50)
    pIC50 = []
    for i in input['standard_value_norm']:
        molar = i * (10**-9)  # Converts nM to M
        pIC50.append(-np.log10(molar))

    input['pIC50'] = pIC50
    x = input.drop('standard_value_norm', axis=1)  # keyword axis; the positional form was removed in pandas 2.x
    return x
print(df_combined.standard_value.describe())
# Rationale for the cap in norm_value below: -log10(1e-9 * 1e8) = 1, while
# -log10(1e-9 * 1e10) = -1, so capping standard_value at 100,000,000 keeps pIC50 >= 1
def norm_value(input):
    # Cap standard_value at 100,000,000 nM so the derived pIC50 stays non-negative
    norm = []
    for i in input['standard_value']:
        if i > 100000000:
            i = 100000000
        norm.append(i)

    input['standard_value_norm'] = norm
    x = input.drop('standard_value', axis=1)
    return x
df_norm = norm_value(df_combined)
print(df_norm)
print(df_norm.standard_value_norm.describe())
df_final = pIC50(df_norm)
print(f'df_final : {df_final}')
print(df_final.pIC50.describe())
df_2class = df_final[df_final.bioactivity_class != 'intermediate']
print(f'df_2class : {df_2class}')
import seaborn as sns
sns.set(style='ticks')
import matplotlib.pyplot as plt
plt.figure(figsize=(5.5, 5.5))
sns.countplot(x='bioactivity_class', data=df_2class, edgecolor='black')
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('Frequency', fontsize=14, fontweight='bold')
plt.savefig('plot_bioactivity_class.pdf')
plt.figure(figsize=(5.5, 5.5))
sns.scatterplot(x='MW', y='LogP', data=df_2class, hue='bioactivity_class', size='pIC50', edgecolor='black', alpha=0.7)
plt.xlabel('MW', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0)
plt.savefig('plot_MW_vs_LogP.pdf')
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'pIC50', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('pIC50 value', fontsize=14, fontweight='bold')
plt.savefig('plot_ic50.pdf')
def mannwhitney(descriptor, verbose=False):
    # https://machinelearningmastery.com/nonparametric-statistical-significance-tests-in-python/
    from numpy.random import seed
    from scipy.stats import mannwhitneyu

    # seed the random number generator
    seed(1)

    # actives and inactives
    selection = [descriptor, 'bioactivity_class']
    df = df_2class[selection]
    active = df[df.bioactivity_class == 'active'][descriptor]
    inactive = df[df.bioactivity_class == 'inactive'][descriptor]

    # compare samples
    stat, p = mannwhitneyu(active, inactive)

    # interpret
    alpha = 0.05
    if p > alpha:
        interpretation = 'Same distribution (fail to reject H0)'
    else:
        interpretation = 'Different distribution (reject H0)'

    results = pd.DataFrame({'Descriptor': descriptor,
                            'Statistics': stat,
                            'p': p,
                            'alpha': alpha,
                            'Interpretation': interpretation}, index=[0])
    filename = 'mannwhitneyu_' + descriptor + '.csv'
    results.to_csv(filename)
    return results
print(f"mannwhitney('pIC50') : {mannwhitney('pIC50')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'MW', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('MW', fontsize=14, fontweight='bold')
plt.savefig('plot_MW.pdf')
print(f"mannwhitney('MW') : {mannwhitney('MW')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'LogP', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('LogP', fontsize=14, fontweight='bold')
plt.savefig('plot_LogP.pdf')
print(f"mannwhitney('LogP') : {mannwhitney('LogP')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'NumHDonors', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHDonors', fontsize=14, fontweight='bold')
plt.savefig('plot_NumHDonors.pdf')
print(f"mannwhitney('NumHDonors') : {mannwhitney('NumHDonors')}")
plt.figure(figsize=(5.5, 5.5))
sns.boxplot(x = 'bioactivity_class', y = 'NumHAcceptors', data = df_2class)
plt.xlabel('Bioactivity class', fontsize=14, fontweight='bold')
plt.ylabel('NumHAcceptors', fontsize=14, fontweight='bold')
plt.savefig('plot_NumHAcceptors.pdf')
print(f"mannwhitney('NumHAcceptors') : {mannwhitney('NumHAcceptors')}")
os.system("zip -r results.zip . -i *.csv *.pdf")
Script for calculating molecular descriptors, which are quantitative descriptions of the compounds in the dataset, and for preparing the dataset for subsequent model building – (book3/process3)
import pandas as pd
import os
import subprocess
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.zip")
os.system("wget https://github.com/dataprofessor/bioinformatics/raw/master/padel.sh")
os.system("unzip padel.zip")
os.system("wget https://raw.githubusercontent.com/dataprofessor/data/master/acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv")
df3 = pd.read_csv('acetylcholinesterase_04_bioactivity_data_3class_pIC50.csv')
print(f'df3 : {df3}')
selection = ['canonical_smiles','molecule_chembl_id']
df3_selection = df3[selection]
df3_selection.to_csv('molecule.smi', sep='\t', index=False, header=False)
print("cat molecule.smi | head -5 : ")
cmd = 'cat molecule.smi'
cmdnext = 'head -5'
p1 = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)
p2 = subprocess.Popen(cmdnext,shell=True, stdin=p1.stdout, stdout=subprocess.PIPE)
output = p2.communicate()[0]
print(output.decode())
print("cat molecule.smi | wc -l : ")
os.system("cat molecule.smi | wc -l")
print("cat padel.sh : ")
os.system("cat padel.sh")
print("bash padel.sh : ")
os.system("bash padel.sh")
print("ls -l : ")
os.system("ls -l")
df3_X = pd.read_csv('descriptors_output.csv')
print(f'df3_X : {df3_X}')
df3_X = df3_X.drop(columns=['Name'])
print(f'df3_X : {df3_X}')
df3_Y = df3['pIC50']
print(f'df3_Y : {df3_Y}')
dataset3 = pd.concat([df3_X,df3_Y], axis=1)
print(f'dataset3 : {dataset3}')
dataset3.to_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)
Script for building a regression model of acetylcholinesterase or coronavirus inhibitors (or any protein's, for that matter) using the random forest algorithm – (book4/process4)
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import os
os.system("wget https://github.com/dataprofessor/data/raw/master/acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv")
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')
X = df.drop('pIC50', axis=1)
print(f'X : {X}')
Y = df.pIC50
print(f'Y : {Y}')
print(f'X.shape : {X.shape}')
print(f'Y.shape : {Y.shape}')
from sklearn.feature_selection import VarianceThreshold
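# Drop near-constant fingerprint bits: for a binary feature, variance p*(1-p)
# below .8*(1-.8) means the same value occurs in more than 80% of samples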
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)
print(f'X.shape : {X.shape}')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, Y_train)
r2 = model.score(X_test, Y_test)
print(f'r2 : {r2}')
Y_pred = model.predict(X_test)
print(f'Y_pred : {Y_pred}')
import matplotlib.pyplot as plt
sns.set(color_codes=True)
sns.set_style("white")
ax = sns.regplot(x=Y_test, y=Y_pred, scatter_kws={'alpha': 0.4})  # seaborn >= 0.12 requires keyword arguments here
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 12)
ax.set_ylim(0, 12)
ax.figure.set_size_inches(5, 5)
# plt.show() is unnecessary in a batch job; save the figure directly
plt.savefig('pIC50_graph.pdf')
Script for comparing several ML algorithms for building regression models of acetylcholinesterase inhibitors – (book5/process5)
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import lazypredict
from lazypredict.Supervised import LazyRegressor
df = pd.read_csv('acetylcholinesterase_06_bioactivity_data_3class_pIC50_pubchem_fp.csv')
X = df.drop('pIC50', axis=1)
Y = df.pIC50
# Examine X dimension
print(f'X.shape {X.shape}')
# Remove low variance features
from sklearn.feature_selection import VarianceThreshold
selection = VarianceThreshold(threshold=(.8 * (1 - .8)))
X = selection.fit_transform(X)
print(f'X.shape {X.shape}')
# Perform data splitting using 80/20 ratio
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# Define and build the LazyRegressor; its fit() takes (X_train, X_test, y_train, y_test),
# so passing the training set twice below scores the models on the training data
clf = LazyRegressor(verbose=0,ignore_warnings=True, custom_metric=None)
models_train,predictions_train = clf.fit(X_train, X_train, Y_train, Y_train)
models_test,predictions_test = clf.fit(X_train, X_test, Y_train, Y_test)
# Performance table of the training set (80% subset)
print(predictions_train)
# Performance table of the test set (20% subset)
print(predictions_test)
# Bar plot of R-squared values
import matplotlib.pyplot as plt
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="R-Squared", data=predictions_train)
ax.set(xlim=(0, 1))
plt.savefig('rsquared.pdf')
# Bar plot of RMSE values
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="RMSE", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('rmse.pdf')
# Bar plot of calculation time
plt.figure(figsize=(25, 30))
sns.set_theme(style="whitegrid")
ax = sns.barplot(y=predictions_train.index, x="Time Taken", data=predictions_train)
ax.set(xlim=(0, 10))
plt.savefig('bar_calculation_time.pdf')
High Level code structure

Run the workflow with the command
~/biopython/dataprofessor-scripts/output# python3 ../process.py > output.log
Observed output :


Time taken to complete the script end to end : 11 mins
List of files generated as output :

Build a Celery-based framework for processing the 5 tasks/jobs detailed in the previous sections.
Celery is a distributed parallel-processing framework that runs on top of RabbitMQ or Redis infrastructure for executing tasks. More information on the framework can be found at https://docs.celeryproject.org/en/stable/userguide/index.html
In order to set up the infrastructure for running Celery, spin up RabbitMQ and Redis containers using Docker:
docker run -d -p 5672:5672 rabbitmq
docker run -d -p 6379:6379 redis

Next, create a project in PyCharm with a file called tasks.py. This file sets up the RabbitMQ broker required by Celery and a Redis results backend.
from celery import Celery
app = Celery('tasks', broker='pyamqp://guest@localhost//', backend='redis://')
# task definitions go here; see the full tasks.py after the group and chord examples below
Use the following command to start the Celery worker:
celery -A tasks worker --loglevel=INFO

Make the 5 process scripts detailed above object-oriented
For this, all we shall be doing is modifying each of the five .py files above to convert them into a class with an __init__ method, as shown in the sketch below. Feel free to split the code into separate functions so that their complexity is reduced. Into the body of each class, simply copy over the code from the corresponding process above.

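Since the diagram is not included here, below is a minimal sketch of what each converted class might look like, assuming the Book1 / execute_process1 naming that the tasks.py further down expects:

class Book1:
    def __init__(self):
        # shared setup (paths, ChEMBL client, etc.) can go here
        pass

    @staticmethod
    def execute_process1():
        # copy the body of book1 (data collection and pre-processing) here,
        # optionally split into smaller helper methods
        ...
        return 'bioactivity_data_preprocessed.csv'

A staticmethod is used so that tasks.py can call Book1.execute_process1() without instantiating the class.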
Create a Celery chain to execute the 5 processes/tasks/books/classes
Create a chain.py file with the following contents.
from celery import chain
from tasks import process1, process2, process3, process4, process5

# The tasks take no arguments and pass data between phases via files on disk,
# so immutable signatures (.si()) are used; with .s(), Celery would feed each
# task's return value into the next task, which these tasks do not accept.
result = chain(process1.si(), process2.si(), process3.si(), process4.si(), process5.si()).apply_async()
print(result.ready())       # False until the chain finishes
print(result.get())         # blocks until the final task completes
print(result.successful())
The above five processes are unfortunately not parallelizable, since each depends on the output of the previous one. But if they were independent, a more scalable option would be to use a group or a chord, as shown in the scripts below.
from celery import group
from tasks import process1, process2, process3, process4, process5
job = group([process1.s(), process2.s(), process3.s(), process4.s(), process5.s(),])
result = job.apply_async()
print(result.ready())
print(result.successful())
print(result.get())
from celery import chord
from tasks import process1, process2, process3, process4, process5, processes

# The chord callback runs once every task in the header has finished and
# receives the list of their results as its argument
header = [process1.s(), process2.s(), process3.s(), process4.s(), process5.s()]
callback = processes.s()
result = chord(header)(callback)
print(result.get())
Finally, our tasks.py file looks like this:
from celery import Celery, chain
from celery.schedules import crontab
from book1 import Book1
from book2 import Book2
from book3 import Book3
from book4 import Book4
from book5 import Book5

app = Celery('tasks', broker='pyamqp://guest@localhost//', backend='redis://')

@app.task
def process1():
    return Book1.execute_process1()

@app.task
def process2():
    return Book2.execute_process2()

@app.task
def process3():
    return Book3.execute_process3()

@app.task
def process4():
    return Book4.execute_process4()

@app.task
def process5():
    return Book5.execute_process5()

@app.task
def processes(results):
    # chord callback: receives the list of results from the header tasks
    return results

@app.task
def job():
    # Nightly entry point referenced by the beat schedule below: kick off the
    # chain of five processes and return the chain's task id
    workflow = chain(process1.si(), process2.si(), process3.si(),
                     process4.si(), process5.si())
    return workflow.apply_async().id

app.conf.beat_schedule = {
    "nightly-task": {
        "task": "tasks.job",
        # crontab(hour=0, minute=0) fires once at midnight; crontab(hour=0)
        # alone would fire every minute of that hour
        "schedule": crontab(hour=0, minute=0)
    }
}
Start Celery beat (alongside the worker) using the command
celery -A tasks beat --loglevel=INFO
Conclusion:
We haven't concluded yet; the remaining work is:
- Upload the PDF files and reports to an S3 bucket in the cloud.
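A minimal sketch of that upload step using boto3, assuming AWS credentials are already configured in the environment; the bucket name is hypothetical:

import boto3
from pathlib import Path

BUCKET = 'chembl-nightly-reports'  # hypothetical bucket name

s3 = boto3.client('s3')
for path in list(Path('.').glob('*.pdf')) + list(Path('.').glob('*.csv')):
    # Store each report under a reports/ prefix in the bucket
    s3.upload_file(str(path), BUCKET, f'reports/{path.name}')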