{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Neural Fingerprints\n", "\n", "We create atom, bond, and edge tensors from molecule SMILES using `chemml.chem.tensorise_molecules` in order to build neural fingerprints using `chemml.models.NeuralGraphHidden` and `chemml.models.NeuralGraphOutput` modules. These neural fingerprints are then used as features to train a simple feed forward neural network to predict densities of small organic compounds using TensorFlow. \n", "\n", "Here we import a sample dataset from the ChemML library which has the SMILES codes for 500 small organic molecules with their densities in $kg/m^3$. " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from chemml.datasets import load_organic_density\n", "molecules, target, dragon_subset = load_organic_density()\n", "target = np.asarray(target['density_Kg/m3'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Building `chemml.chem.Molecule` objects from molecule SMILES. " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from chemml.chem import Molecule\n", "mol_objs_list = []\n", "for smi in molecules['smiles']:\n", " mol = Molecule(smi, 'smiles')\n", " mol.hydrogens('add')\n", " mol.to_xyz('MMFF', maxIters=10000, mmffVariant='MMFF94s')\n", " mol_objs_list.append(mol)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Molecule tensors can be used to create neural graph fingerprints using `chemml.models`." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tensorising molecules in batches of 100 ...\n", "\u001b[1m500/500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 16ms/step \n", "Merging batch tensors ... 
[DONE]\n" ] } ], "source": [ "from chemml.chem import tensorise_molecules\n", "xatoms, xbonds, xedges = tensorise_molecules(molecules=mol_objs_list, max_degree=5, \n", " max_atoms=None, n_jobs=-1, batch_size=100, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Splitting and preprocessing the data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import ShuffleSplit\n", "from sklearn.preprocessing import StandardScaler\n", "y_scale = StandardScaler()\n", "rs = ShuffleSplit(n_splits=1, test_size=.20, random_state=42)\n", "\n", "for train, test in rs.split(mol_objs_list):\n", " xatoms_train = xatoms[train]\n", " xatoms_test = xatoms[test]\n", " xbonds_train = xbonds[train]\n", " xbonds_test = xbonds[test]\n", " xedges_train = xedges[train]\n", " xedges_test = xedges[test]\n", " target_train = target[train]\n", " target_test = target[test]\n", " target_train = y_scale.fit_transform(target_train.reshape(-1,1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data:\n", "\n", "Atoms: (400, 57, 62)\n", "Bonds: (400, 57, 5, 6)\n", "Edges: (400, 57, 5)\n", "Target: (400, 1)\n", "\n", "Testing data:\n", "\n", "Atoms: (100, 57, 62)\n", "Bonds: (100, 57, 5, 6)\n", "Edges: (100, 57, 5)\n", "Target: (100,)\n" ] } ], "source": [ "print('Training data:\\n')\n", "print('Atoms: ',xatoms_train.shape)\n", "print('Bonds: ',xbonds_train.shape)\n", "print('Edges: ',xedges_train.shape)\n", "print('Target: ',target_train.shape)\n", "\n", "print('\\nTesting data:\\n')\n", "print('Atoms: ',xatoms_test.shape)\n", "print('Bonds: ',xbonds_test.shape)\n", "print('Edges: ',xedges_test.shape)\n", "print('Target: ',target_test.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building the Neural Fingerprints\n", "\n", "The atom, bond, and edge tensors are used here to build 200 
neural fingerprint features per molecule, using graph-convolution layers of width 8 (i.e., each convolution layer computes 8 hidden features per atom). " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Neural Fingerprint Shape: (None, 200)\n" ] } ], "source": [ "from chemml.models import NeuralGraphHidden, NeuralGraphOutput\n", "from tensorflow.keras.layers import Input, add\n", "import tensorflow as tf\n", "tf.random.set_seed(42)\n", "\n", "conv_width = 8\n", "fp_length = 200\n", "\n", "num_molecules = xatoms_train.shape[0]\n", "max_atoms = xatoms_train.shape[1]\n", "max_degree = xbonds_train.shape[2]\n", "num_atom_features = xatoms_train.shape[-1]\n", "num_bond_features = xbonds_train.shape[-1]\n", "\n", "# Creating input layers for atoms ,bonds and edge information\n", "atoms0 = Input(name='atom_inputs', shape=(max_atoms, num_atom_features),batch_size=None)\n", "bonds = Input(name='bond_inputs', shape=(max_atoms, max_degree, num_bond_features),batch_size=None)\n", "edges = Input(name='edge_inputs', shape=(max_atoms, max_degree), dtype='int32',batch_size=None)\n", "\n", "# Defining the convolved atom feature layers \n", "atoms1 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms0, bonds, edges])\n", "atoms2 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms1, bonds, edges])\n", "\n", "# Defining the outputs of each (convolved) atom feature layer to fingerprint\n", "fp_out0 = NeuralGraphOutput(fp_length, activation='softmax')([atoms0,bonds,edges])\n", "fp_out1 = NeuralGraphOutput(fp_length, activation='softmax')([atoms1,bonds,edges])\n", "fp_out2 = NeuralGraphOutput(fp_length, activation='softmax')([atoms2,bonds,edges])\n", "\n", "# Sum outputs to obtain fingerprint \n", "final_fp = add([fp_out0, fp_out1, fp_out2])\n", "print('Neural Fingerprint Shape: ',final_fp.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building and training the 
neural network\n", "\n", "Here, we build and train a simple feed forward neural network using `tensorflow.keras` and provide our neural fingerprints as features. " ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:TensorFlow GPU support is not available on native Windows for TensorFlow >= 2.11. Even if CUDA/cuDNN are installed, GPU will not be used. Please use WSL2 or the TensorFlow-DirectML plugin.\n" ] }, { "data": { "text/html": [ "
Model: \"functional\"\n",
"\n"
],
"text/plain": [
"\u001b[1mModel: \"functional\"\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃ Layer (type) ┃ Output Shape ┃ Param # ┃ Connected to ┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ atom_inputs │ (None, 57, 62) │ 0 │ - │\n",
"│ (InputLayer) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bond_inputs │ (None, 57, 5, 6) │ 0 │ - │\n",
"│ (InputLayer) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ edge_inputs │ (None, 57, 5) │ 0 │ - │\n",
"│ (InputLayer) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_hidden │ (None, 57, 8) │ 2,720 │ atom_inputs[0][0… │\n",
"│ (NeuralGraphHidden) │ │ │ bond_inputs[0][0… │\n",
"│ │ │ │ edge_inputs[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_hidde… │ (None, 57, 8) │ 560 │ neural_graph_hid… │\n",
"│ (NeuralGraphHidden) │ │ │ bond_inputs[0][0… │\n",
"│ │ │ │ edge_inputs[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_output │ (None, 200) │ 13,800 │ atom_inputs[0][0… │\n",
"│ (NeuralGraphOutput) │ │ │ bond_inputs[0][0… │\n",
"│ │ │ │ edge_inputs[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_outpu… │ (None, 200) │ 3,000 │ neural_graph_hid… │\n",
"│ (NeuralGraphOutput) │ │ │ bond_inputs[0][0… │\n",
"│ │ │ │ edge_inputs[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_outpu… │ (None, 200) │ 3,000 │ neural_graph_hid… │\n",
"│ (NeuralGraphOutput) │ │ │ bond_inputs[0][0… │\n",
"│ │ │ │ edge_inputs[0][0] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ add (Add) │ (None, 200) │ 0 │ neural_graph_out… │\n",
"│ │ │ │ neural_graph_out… │\n",
"│ │ │ │ neural_graph_out… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_layer0 │ (None, 128) │ 25,728 │ add[0][0] │\n",
"│ (Dense) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_layer1 │ (None, 64) │ 8,256 │ dense_layer0[0][… │\n",
"│ (Dense) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ main_prediction │ (None, 1) │ 65 │ dense_layer1[0][… │\n",
"│ (Dense) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n",
"\n"
],
"text/plain": [
"┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mConnected to \u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩\n",
"│ atom_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m57\u001b[0m, \u001b[38;5;34m62\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ bond_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m57\u001b[0m, \u001b[38;5;34m5\u001b[0m, \u001b[38;5;34m6\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ edge_inputs │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m57\u001b[0m, \u001b[38;5;34m5\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ - │\n",
"│ (\u001b[38;5;33mInputLayer\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_hidden │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m57\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m2,720\u001b[0m │ atom_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mNeuralGraphHidden\u001b[0m) │ │ │ bond_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ edge_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_hidde… │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m57\u001b[0m, \u001b[38;5;34m8\u001b[0m) │ \u001b[38;5;34m560\u001b[0m │ neural_graph_hid… │\n",
"│ (\u001b[38;5;33mNeuralGraphHidden\u001b[0m) │ │ │ bond_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ edge_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_output │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m200\u001b[0m) │ \u001b[38;5;34m13,800\u001b[0m │ atom_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ (\u001b[38;5;33mNeuralGraphOutput\u001b[0m) │ │ │ bond_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ edge_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_outpu… │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m200\u001b[0m) │ \u001b[38;5;34m3,000\u001b[0m │ neural_graph_hid… │\n",
"│ (\u001b[38;5;33mNeuralGraphOutput\u001b[0m) │ │ │ bond_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ edge_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ neural_graph_outpu… │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m200\u001b[0m) │ \u001b[38;5;34m3,000\u001b[0m │ neural_graph_hid… │\n",
"│ (\u001b[38;5;33mNeuralGraphOutput\u001b[0m) │ │ │ bond_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m… │\n",
"│ │ │ │ edge_inputs[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ add (\u001b[38;5;33mAdd\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m200\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │ neural_graph_out… │\n",
"│ │ │ │ neural_graph_out… │\n",
"│ │ │ │ neural_graph_out… │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_layer0 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m25,728\u001b[0m │ add[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m0\u001b[0m] │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ dense_layer1 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │ dense_layer0[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"├─────────────────────┼───────────────────┼────────────┼───────────────────┤\n",
"│ main_prediction │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m65\u001b[0m │ dense_layer1[\u001b[38;5;34m0\u001b[0m][\u001b[38;5;34m…\u001b[0m │\n",
"│ (\u001b[38;5;33mDense\u001b[0m) │ │ │ │\n",
"└─────────────────────┴───────────────────┴────────────┴───────────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Total params: 57,129 (223.16 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m57,129\u001b[0m (223.16 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Trainable params: 57,129 (223.16 KB)\n", "\n" ], "text/plain": [ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m57,129\u001b[0m (223.16 KB)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
Non-trainable params: 0 (0.00 B)\n", "\n" ], "text/plain": [ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "
| \n", " | ME | \n", "MAE | \n", "MSE | \n", "RMSE | \n", "MSLE | \n", "RMSLE | \n", "MAPE | \n", "MaxAPE | \n", "RMSPE | \n", "MPE | \n", "MaxAE | \n", "deltaMaxE | \n", "r_squared | \n", "std | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "-6.17789 | \n", "15.302616 | \n", "419.635458 | \n", "20.485006 | \n", "0.000286 | \n", "0.016918 | \n", "1.238328 | \n", "7.243467 | \n", "1.715904 | \n", "-0.526769 | \n", "72.840308 | \n", "111.701436 | \n", "0.945338 | \n", "87.617825 | \n", "