Predicting Cell Effective Permeability with Gradient Boosted Regression

TDC ADMET, Caco-2_Wang Submission

The structure for this model was originally written by Lealia Xiong, and it is used with permission and attribution. Modifications including scaling the y values and tuning the hyperparameters were done by Rob Learsch.

Dataset Description: The human colon epithelial cancer cell line, Caco-2, is used as an in vitro model to simulate the human intestinal tissue. The experimental result on the rate of drug passing through the Caco-2 cells can approximate the rate at which the drug permeates through the human intestinal tissue.

Task Description: Regression. Given a drug SMILES string, predict the Caco-2 cell effective permeability.

Dataset Statistics: 906 drugs.

from typing import Tuple

import numpy as np
import pandas as pd

# cheminformatics
import rdkit.Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

# logging
import tqdm

# data preprocessing
import sklearn.impute
import sklearn.preprocessing

# modeling
import sklearn.ensemble

# metrics
import sklearn.metrics


from tdc.single_pred import ADME
data = ADME(name = 'Caco2_Wang')
split = data.get_split()
Found local copy...
Loading...
Done!

Add features

The features used are the basic chemical descriptors from RDKit.

def add_descriptor_columns(data: pd.DataFrame) -> pd.DataFrame:
    """
    Use rdkit to get descriptors of each drug in the `data` df.
    Return a Pandas DataFrame with the descriptors as columns in the df and .
    """
    
    # Extract the Drug column
    assert 'Drug' in data.columns, "'Drug' must be a column in the input DataFrame."
    drugs = data['Drug']
    y = data['Y']
    
    # Get the descriptors for each drug
    print("Calculating descriptors...")
    descriptors = []
    for drug, target in tqdm.tqdm(zip(drugs, y)):
        descriptor = Descriptors.CalcMolDescriptors(
            rdkit.Chem.MolFromSmiles(drug)
        )
        descriptor['Drug'] = drug
        descriptor['Y'] = target
        descriptors.append(descriptor)

    # Make a dataframe for the descriptors
    df = pd.DataFrame(descriptors)

    return df

Processing is done to impute values for the cells with missing values and to scale both the X-data and y-data separately. Be sure to only train the imputer and scalers on the training data - not the test data.

def preprocess_data(
    data: pd.DataFrame, 
    imputer=sklearn.impute.SimpleImputer(missing_values=np.nan, strategy='mean'),
    fit_imputer=True,
    scaler_X=sklearn.preprocessing.RobustScaler(),
    scaler_y=sklearn.preprocessing.RobustScaler(),
    fit_scaler=True
):
    """
    Imputes missing values.
    Scales feature data.

    Returns a tuple X, y of scaled feature data and target data.
    """

    col_array = np.array(data.columns)

    # extract just the feature data
    X = data[col_array[~np.isin(col_array, ['Drug_ID', 'Drug', 'Y'])]].to_numpy()
    
    # extract the target data
    y = np.array(data['Y']).reshape(-1,1)
    
    # impute missing data
    if imputer is not None:
        if fit_imputer:
            X = imputer.fit_transform(X)
        else:
            X = imputer.transform(X)

    # scale the feature data
    if scaler_X is not None:
        if fit_scaler:
            X = scaler_X.fit_transform(X)
            y = scaler_y.fit_transform(y)
        else:
            X = scaler_X.transform(X)
            y = scaler_y.transform(y)



    return X, y, imputer, scaler_X, scaler_y
regr = sklearn.ensemble.GradientBoostingRegressor()
X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(add_descriptor_columns(split['train']))
regr.fit(X_train, y_train)
Calculating descriptors...


637it [00:05, 111.49it/s]
GradientBoostingRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
y_train_pred = regr.predict(X_train)

sklearn.metrics.mean_absolute_error(y_train, y_train_pred)
0.14833562767542
X_val, y_val, _, _,_ = preprocess_data(
    add_descriptor_columns(split['valid']),
    imputer=imputer, fit_imputer=False,
    scaler_X=scaler_X, scaler_y=scaler_y,
    fit_scaler=False
)
Calculating descriptors...


91it [00:00, 125.23it/s]
y_val_pred = regr.predict(X_val)

sklearn.metrics.mean_absolute_error(y_val, y_val_pred)
0.2834360140217522
X_test, y_test, _, _, _ = preprocess_data(
    add_descriptor_columns(split['test']),
    imputer=imputer, fit_imputer=False,
    scaler_X=scaler_X, scaler_y=scaler_y,
    fit_scaler=False
)
Calculating descriptors...


182it [00:01, 119.27it/s]
y_test_pred = regr.predict(X_test)
sklearn.metrics.mean_absolute_error(y_test, y_test_pred)
0.2656885538922705

Not a bad start. Let’s dial in the hyperparameters and improve it a bit more.

Tune hyperparameters:

Use the validation dataset to test the hyperparameters of the model. The learning rate and number of estimators help the model to learn from the training set. The subsample helps to prevent over-fitting by randomly removing some training data when fitting and produces a more general model, which should be better at predicting the results in the validation and test datasets.

I’m using a homemade version of GridSearchCV to find the parameters because I already have a validation set defined.

from sklearn.model_selection import ParameterGrid
params = [    
    {'subsample' : np.linspace(0.5,0.9,5),
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07], 
    'n_estimators': [100, 125, 150, 175, 200, 225, 250,],
    }
]
best_score = sklearn.metrics.mean_absolute_error(y_val, y_val_pred)
best_set = {}
for param_set in ParameterGrid(params):
    regr = sklearn.ensemble.GradientBoostingRegressor(
    random_state=1,
    )
    regr.set_params(**param_set)
    regr.fit(X_train,y_train)
    score_MAE = sklearn.metrics.mean_absolute_error(y_val, regr.predict(X_val))
    #lower is better
    # save if best
    if score_MAE < best_score:
        best_score = score_MAE
        best_set = param_set
from tdc.benchmark_group import admet_group
group = admet_group(path = 'data/')
predictions_list = []
for seed in [1, 2, 3, 4, 5]:
    benchmark = group.get('Caco2_Wang') 
    predictions = {}
    name = benchmark['name']
    train, test = benchmark['train_val'], benchmark['test']
    print(f"Seed {seed}:")
    X_train, y_train, imputer, scaler_X, scaler_y = preprocess_data(add_descriptor_columns(train))
    X_test, y_test, _, _, _ = preprocess_data(
        add_descriptor_columns(test), 
        imputer=imputer, fit_imputer=False, 
        scaler_X=scaler_X, fit_scaler=False, 
        scaler_y=scaler_y
    )
    
    regr = sklearn.ensemble.GradientBoostingRegressor(
        random_state=seed,
    )
    regr.set_params(**best_set)
    regr.fit(X_train, y_train)

    y_pred_test_scaled = regr.predict(X_test)
    y_pred_test = scaler_y.inverse_transform(y_pred_test_scaled.reshape(-1,1))
    y_pred_test = y_pred_test.reshape(-1)
    
    predictions = {}
    prediction_dict = {name: y_pred_test}
    predictions_list.append(prediction_dict)
    
results = group.evaluate_many(predictions_list)
print(results)
Found local copy...


Seed 1:
Calculating descriptors...


728it [00:06, 115.24it/s]


Calculating descriptors...


182it [00:01, 113.16it/s]


Seed 2:
Calculating descriptors...


728it [00:06, 114.21it/s]


Calculating descriptors...


182it [00:01, 111.07it/s]


Seed 3:
Calculating descriptors...


728it [00:06, 114.39it/s]


Calculating descriptors...


182it [00:01, 111.53it/s]


Seed 4:
Calculating descriptors...


728it [00:06, 114.49it/s]


Calculating descriptors...


182it [00:01, 111.84it/s]


Seed 5:
Calculating descriptors...


728it [00:06, 114.72it/s]


Calculating descriptors...


182it [00:01, 110.93it/s]


{'caco2_wang': [0.274, 0.004]}

As of 2024/07/21, this is a first place score on the leaderboard!

The current leader is MapLight, Jim Notwell, 0.276 ± 0.005

print('Number of parameters: '+str(len(regr.get_params())))
Number of parameters: 21
regr.get_params()
{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.06,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 250,
 'n_iter_no_change': None,
 'random_state': 5,
 'subsample': 0.5,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}