Source code for deeprank.generate.GridTools


import itertools
import sys
from time import time

import numpy as np
from scipy.signal import bspline
import pdb2sql

from deeprank.config import logger
from deeprank.tools import sparse

try:
    from tqdm import tqdm
except ImportError:
    def tqdm(x):
        return x


def logif(string, cond):
    return logger.info(string) if cond else None
class GridTools(object):

    def __init__(self, molgrp, chain1, chain2,
                 number_of_points=30, resolution=1.,
                 atomic_densities=None, atomic_densities_mode='ind',
                 feature=None, feature_mode='ind',
                 contact_distance=8.5,
                 cuda=False, gpu_block=None,
                 cuda_func=None, cuda_atomic=None,
                 prog_bar=False, time=False,
                 try_sparse=True):
        """Map the features of a complex on the grid.

        Args:
            molgrp(str): name of the group of the molecule in the HDF5 file.
            chain1 (str): First chain ID.
            chain2 (str): Second chain ID.
            number_of_points(int, optional): number of points we want in
                each direction of the grid.
            resolution(float, optional): distance (in Angs) between two points.
            atomic_densities(dict, optional): dictionary of element types with
                their vdw radius, see deeprank.config.atom_vdw_radius_noH
            atomic_densities_mode(str, optional): Mode for mapping
                (deprecated, must be 'ind').
            feature(None, optional): Name of the features to be mapped.
                By default all the features present in
                hdf5_file['<molgrp>/features/'] will be mapped.
            feature_mode(str, optional): Mode for mapping
                (deprecated, must be 'ind').
            contact_distance(float, optional): the maximum distance between
                two contact atoms, default 8.5 Angs.
            cuda(bool, optional): Use CUDA or not.
            gpu_block(tuple(int), optional): GPU block size to use.
            cuda_func(None, optional): Name of the CUDA function to be used
                for the mapping of the features.
                Must be present in kernel_cuda.c.
            cuda_atomic(None, optional): Name of the CUDA function to be used
                for the mapping of the atomic densities.
                Must be present in kernel_cuda.c.
            prog_bar(bool, optional): print progression bar for individual
                grid (default False).
            time(bool, optional): print timing statistic for individual grid
                (default False).
            try_sparse(bool, optional): Try to store the matrix in sparse
                format (default True).
        """

        # molecule and hdf5 file
        self.molgrp = molgrp
        self.mol_basename = molgrp.name

        # chain IDs
        self.chain1 = chain1
        self.chain2 = chain2

        # hdf5 file to store the data
        self.hdf5 = self.molgrp.file
        self.try_sparse = try_sparse

        # parameters of the grid
        if number_of_points is not None:
            if not isinstance(number_of_points, list):
                number_of_points = [number_of_points] * 3
            self.npts = np.array(number_of_points).astype('int')

        if resolution is not None:
            if not isinstance(resolution, list):
                resolution = [resolution] * 3
            self.res = np.array(resolution)

        # features requested
        self.atomic_densities = atomic_densities
        self.feature = feature

        # mapping modes
        self.feature_mode = feature_mode
        self.atomic_densities_mode = atomic_densities_mode

        # cuda support
        self.cuda = cuda
        if self.cuda:  # pragma: no cover
            self.gpu_block = gpu_block
            self.gpu_grid = [int(np.ceil(n / b))
                             for b, n in zip(self.gpu_block, self.npts)]

        # cuda kernels
        self.cuda_func = cuda_func
        self.cuda_atomic = cuda_atomic

        # parameters of the atomic system
        self.atom_xyz = None
        self.atom_index = None
        self.atom_type = None

        # grid points
        self.x = None
        self.y = None
        self.z = None

        # grids for the calculation of the atomic densities
        self.xgrid = None
        self.ygrid = None
        self.zgrid = None

        # dictionary of atomic densities
        self.atdens = {}

        # conversion from bohr to angstrom for VMD visualization
        self.bohr2ang = 0.52918

        # contact distance used to locate the interface
        self.contact_distance = contact_distance

        # progress bar
        self.local_tqdm = lambda x: tqdm(x) if prog_bar else x
        self.time = time

        # if we already have an output containing the grid
        # we update the existing features
        _update_ = False
        if self.mol_basename + '/grid_points/x' in self.hdf5:
            _update_ = True

        if _update_:
            logif(f'\n= Updating grid data for {self.mol_basename}.',
                  self.time)
            self.update_feature()
        else:
            logif(f'\n= Creating grid and grid data for {self.mol_basename}.',
                  self.time)
            self.create_new_data()

    ################################################################
    def create_new_data(self):
        """Create new features for a given complex."""

        # get the positions/atom types ... of the complex
        self.read_pdb()

        # get the contact atoms and the interface center
        self.get_contact_center()

        # define the grid
        self.define_grid_points()

        # save the grid points
        self.export_grid_points()

        # map the features
        self.add_all_features()

        # map the atomic densities if requested
        self.add_all_atomic_densities()

        # close the db file
        self.sqldb._close()
    ################################################################
    def update_feature(self):
        """Update existing features of a complex."""

        # get the positions/atom types ... of the complex
        # this sets self.sqldb
        self.read_pdb()

        # read the grid from the hdf5
        grid = self.hdf5.get(self.mol_basename + '/grid_points/')
        self.x, self.y, self.z = grid['x'][()], grid['y'][()], grid['z'][()]

        # create the grid
        self.ygrid, self.xgrid, self.zgrid = np.meshgrid(
            self.y, self.x, self.z)

        # set the resolution/dimension
        self.npts = np.array([len(self.x), len(self.y), len(self.z)])
        self.res = np.array([self.x[1] - self.x[0],
                             self.y[1] - self.y[0],
                             self.z[1] - self.z[0]])

        # map the features
        self.add_all_features()

        # map the atomic densities if requested
        self.add_all_atomic_densities()

        # close the db file
        self.sqldb._close()
    ################################################################
    def read_pdb(self):
        """Create a sql database for the pdb."""
        self.sqldb = pdb2sql.interface(self.molgrp['complex'][()])
    # get the contact atoms and interface center
    def get_contact_center(self):
        """Get the center of the contact atoms."""

        contact_atoms = self.sqldb.get_contact_atoms(
            cutoff=self.contact_distance,
            chain1=self.chain1,
            chain2=self.chain2)

        tmp = []
        for i in contact_atoms.values():
            tmp.extend(i)
        contact_atoms = list(set(tmp))

        # get the interface center
        self.center_contact = np.mean(
            np.array(self.sqldb.get('x,y,z', rowID=contact_atoms)), 0)
    ################################################################
    # shortcuts to add all the features
    # and atomic densities in just one line
    ################################################################

    # add all the residue features to the data
    def add_all_features(self):
        """Add all the features to a given molecule."""

        # map the features
        if self.feature is not None:

            # map the residue features
            dict_data = self.map_features(self.feature)

            # save to hdf5 if specified
            t0 = time()
            logif('-- Save Features to HDF5', self.time)
            self.hdf5_grid_data(dict_data, 'Feature_%s' % (self.feature_mode))
            logif('   Total %f ms' % ((time() - t0) * 1000), self.time)
    # add all the atomic densities to the data
    def add_all_atomic_densities(self):
        """Add all atomic densities."""

        # if we want the atomic densities
        if self.atomic_densities is not None:

            # compute the atomic densities
            self.map_atomic_densities()

            # save to hdf5
            t0 = time()
            logif('-- Save Atomic Densities to HDF5', self.time)
            self.hdf5_grid_data(
                self.atdens,
                'AtomicDensities_%s' % (self.atomic_densities_mode))
            logif('   Total %f ms' % ((time() - t0) * 1000), self.time)
    ################################################################
    # define the grid points
    # there may be an issue with the ordering:
    # in order to visualize the data in VMD the Y and X axes must be
    # inverted ...
    # it is kept like that for now as it should not matter for the CNN,
    # and maybe we don't need atomic densities as features
    ################################################################
    def define_grid_points(self):
        """Define the grid points."""

        logif('-- Define %dx%dx%d grid ' %
              (self.npts[0], self.npts[1], self.npts[2]), self.time)
        logif('-- Resolution of %1.2fx%1.2fx%1.2f Angs' %
              (self.res[0], self.res[1], self.res[2]), self.time)

        halfdim = 0.5 * (self.npts * self.res)
        center = self.center_contact

        low_lim = center - halfdim
        hgh_lim = low_lim + self.res * (np.array(self.npts) - 1)

        self.x = np.linspace(low_lim[0], hgh_lim[0], self.npts[0])
        self.y = np.linspace(low_lim[1], hgh_lim[1], self.npts[1])
        self.z = np.linspace(low_lim[2], hgh_lim[2], self.npts[2])

        # there is something fishy about the 3d meshgrid:
        # the axes are a bit screwy ....
        # it is not quite clear why the ordering is like that
        self.ygrid, self.xgrid, self.zgrid = np.meshgrid(
            self.y, self.x, self.z)
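    # Illustration (not part of the original code): with npts = [30, 30, 30]
    # and res = [1., 1., 1.], halfdim = 15 along each axis, so for a given
    # center c each axis runs from c - 15 to c - 15 + 1 * (30 - 1) = c + 14
    # in steps of 1 Angs, i.e. 30 evenly spaced points per direction.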
    ################################################################
    # Atomic densities
    # as defined in the Koes paper on protein-ligand scoring with
    # convolutional neural networks
    ################################################################

    # compute all the atomic density data
    def map_atomic_densities(self, only_contact=True):
        """Map the atomic densities to the grid.

        Args:
            only_contact(bool, optional): Map only the contact atoms

        Raises:
            ImportError: if pycuda cannot be imported when CUDA is requested
        """
        mode = self.atomic_densities_mode
        logif('-- Map atomic densities on %dx%dx%d grid (mode=%s)' %
              (self.npts[0], self.npts[1], self.npts[2], mode), self.time)

        # prepare the cuda memory
        if self.cuda:  # pragma: no cover

            # try to import pycuda
            try:
                from pycuda import driver, compiler, gpuarray, tools
                import pycuda.autoinit
            except BaseException:
                raise ImportError("Error when importing pyCuda in GridTools")

            # book memory on the gpu
            x_gpu = gpuarray.to_gpu(self.x.astype(np.float32))
            y_gpu = gpuarray.to_gpu(self.y.astype(np.float32))
            z_gpu = gpuarray.to_gpu(self.z.astype(np.float32))
            grid_gpu = gpuarray.zeros(self.npts, np.float32)

        # get the contact atoms
        if only_contact:
            index = self.sqldb.get_contact_atoms(
                cutoff=self.contact_distance,
                chain1=self.chain1,
                chain2=self.chain2)
        else:
            index = {self.chain1: self.sqldb.get('rowID', chainID=self.chain1),
                     self.chain2: self.sqldb.get('rowID', chainID=self.chain2)}

        # loop over all the element types we want
        for elementtype, vdw_rad in self.local_tqdm(
                self.atomic_densities.items()):

            t0 = time()
            xyzA = np.array(self.sqldb.get(
                'x,y,z', rowID=index[self.chain1], element=elementtype))
            xyzB = np.array(self.sqldb.get(
                'x,y,z', rowID=index[self.chain2], element=elementtype))
            tprocess = time() - t0

            t0 = time()
            # if we use CUDA
            if self.cuda:  # pragma: no cover

                # reset the grid
                grid_gpu *= 0

                # get the atomic densities of chain A
                for pos in xyzA:
                    x0, y0, z0 = pos.astype(np.float32)
                    vdw = np.float32(vdw_rad)
                    self.cuda_atomic(vdw, x0, y0, z0,
                                     x_gpu, y_gpu, z_gpu, grid_gpu,
                                     block=tuple(self.gpu_block),
                                     grid=tuple(self.gpu_grid))
                atdensA = grid_gpu.get()

                # reset the grid
                grid_gpu *= 0

                # get the atomic densities of chain B
                for pos in xyzB:
                    x0, y0, z0 = pos.astype(np.float32)
                    vdw = np.float32(vdw_rad)
                    self.cuda_atomic(vdw, x0, y0, z0,
                                     x_gpu, y_gpu, z_gpu, grid_gpu,
                                     block=tuple(self.gpu_block),
                                     grid=tuple(self.gpu_grid))
                atdensB = grid_gpu.get()

            # if we don't use CUDA
            else:

                # init the grids
                atdensA = np.zeros(self.npts)
                atdensB = np.zeros(self.npts)

                # run on the atoms of chain A
                for pos in xyzA:
                    atdensA += self.densgrid(pos, vdw_rad)

                # run on the atoms of chain B
                for pos in xyzB:
                    atdensB += self.densgrid(pos, vdw_rad)

            # create the final grid: A - B
            if mode == 'diff':
                self.atdens[elementtype] = atdensA - atdensB

            # create the final grid: A + B
            elif mode == 'sum':
                self.atdens[elementtype] = atdensA + atdensB

            # create the final grids: A and B
            elif mode == 'ind':
                self.atdens[elementtype + '_chain1'] = atdensA
                self.atdens[elementtype + '_chain2'] = atdensB
            else:
                raise ValueError(f'Atomic density mode {mode} not recognized')

            tgrid = time() - t0

            logif('   Process time %f ms' % (tprocess * 1000), self.time)
            logif('   Grid time %f ms' % (tgrid * 1000), self.time)
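    # Illustration (not part of the original code): atomic_densities is
    # expected to map element symbols to vdw radii, e.g. something along the
    # lines of {'C': 1.7, 'N': 1.55, 'O': 1.52, 'S': 1.8} (values in Angs;
    # see deeprank.config.atom_vdw_radius_noH for the actual table).
    # In the default 'ind' mode the method then fills self.atdens with one
    # grid per element and per chain, e.g. 'C_chain1' and 'C_chain2'.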
    # compute the atomic densities on the grid
    def densgrid(self, center, vdw_radius):
        """Map an individual atomic density on the grid.

        The formula is equation (1) of the Koes paper
        'Protein-Ligand Scoring with Convolutional Neural Networks'
        (arXiv:1612.02751v1).

        Args:
            center (list(float)): position of the atom
            vdw_radius (float): vdw radius of the atom

        Returns:
            np.array: mapped density
        """
        x0, y0, z0 = center
        dd = np.sqrt((self.xgrid - x0)**2
                     + (self.ygrid - y0)**2
                     + (self.zgrid - z0)**2)
        dgrid = np.zeros(self.npts)

        index_shortd = dd < vdw_radius
        index_longd = (dd >= vdw_radius) & (dd < 1.5 * vdw_radius)

        dgrid[index_shortd] = np.exp(
            -2 * dd[index_shortd]**2 / vdw_radius**2)
        dgrid[index_longd] = 4. / np.e**2 / vdw_radius**2 * dd[index_longd]**2 \
            - 12. / np.e**2 / vdw_radius * dd[index_longd] + 9. / np.e**2

        return dgrid
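    # Illustration (not part of the original code): the piecewise density
    # above is continuous and vanishes at 1.5 * vdw_radius. Writing
    # r = vdw_radius and d for the distance to the atom:
    #   d = 0      ->  exp(0)                            = 1
    #   d = r      ->  exp(-2)                           ~ 0.135   (Gaussian branch)
    #   d = r      ->  (4 - 12 + 9) / e**2 = 1 / e**2    ~ 0.135   (quadratic branch)
    #   d = 1.5 r  ->  (9 - 18 + 9) / e**2               = 0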
    ################################################################
    # Residue or atomic features
    # read the data provided in input
    # and map it on the grid
    ################################################################

    # map a feature on the grid
    def map_features(self, featlist, transform=None):
        """Map individual features to the grid.

        For residue-based features the feature file must be of the format
        chainID residue_name(3-letter) residue_number [values]

        For atom-based features it must be
        chainID residue_name(3-letter) residue_number atom_name [values]

        Args:
            featlist (list(str)): list of features to be mapped
            transform (callable, optional): transformation applied to the
                feature values

        Returns:
            dict: mapped features

        Raises:
            ImportError: if pycuda cannot be imported when CUDA is requested
            ValueError: if a requested feature is not found in the file
        """
        # declare the total dictionary
        dict_data = {}

        # prepare the cuda memory
        if self.cuda:  # pragma: no cover

            # try to import pycuda
            try:
                from pycuda import driver, compiler, gpuarray, tools
                import pycuda.autoinit
            except BaseException:
                raise ImportError("Error when importing pyCuda in GridTools")

            # book memory on the gpu
            x_gpu = gpuarray.to_gpu(self.x.astype(np.float32))
            y_gpu = gpuarray.to_gpu(self.y.astype(np.float32))
            z_gpu = gpuarray.to_gpu(self.z.astype(np.float32))
            grid_gpu = gpuarray.zeros(self.npts, np.float32)

        # loop over all the required features
        for feature_name in featlist:

            logif('-- Map %s on %dx%dx%d grid ' %
                  (feature_name, self.npts[0], self.npts[1], self.npts[2]),
                  self.time)

            # read the data
            featgrp = self.molgrp['features']
            if feature_name in featgrp.keys():
                data = featgrp[feature_name][:]
            else:
                print('Error: Feature not found \n\tPossible features: ' +
                      ' | '.join(featgrp.keys()))
                raise ValueError(
                    'feature %s not found in the file' % (feature_name))

            # detect if we have a xyz format or a byte format
            # define how many elements (ntext) are present
            # before the feature values
            # xyz: 4 (chain x y z)
            # byte - residue: 3 (chain resSeq resName)
            # byte - atomic: 4 (chain resSeq resName name)
            # all the formats are now xyz
            feature_type = 'xyz'
            ntext = 4

            # test if the transform is callable
            # and test it on the first line of the data
            if data.shape[0] != 0:
                data_test = data[0, ntext:]

                # define the length of the output
                if transform is None:
                    nFeat = len(data_test)
                elif callable(transform):
                    nFeat = len(transform(data_test))
                else:
                    print('Error: transform in map_features must be callable')
                    return None
            else:
                nFeat = 1

            # declare the dict that will in fine hold all the data
            if nFeat == 1:
                if self.feature_mode == 'ind':
                    dict_data[feature_name + '_chain1'] = np.zeros(self.npts)
                    dict_data[feature_name + '_chain2'] = np.zeros(self.npts)
                else:
                    dict_data[feature_name] = np.zeros(self.npts)
            else:  # do we need that ?!
                for iF in range(nFeat):
                    if self.feature_mode == 'ind':
                        dict_data[feature_name +
                                  '_chain1_%03d' % iF] = np.zeros(self.npts)
                        dict_data[feature_name +
                                  '_chain2_%03d' % iF] = np.zeros(self.npts)
                    else:
                        dict_data[feature_name +
                                  '_%03d' % iF] = np.zeros(self.npts)

            # skip empty features
            if data.shape[0] == 0:
                continue

            # reset the grid
            if self.cuda:  # pragma: no cover
                grid_gpu *= 0

            # timing
            tprocess = 0
            tgrid = 0

            # map all the features
            for line in self.local_tqdm(data):

                t0 = time()

                # if the feature was written with xyz data
                # i.e. chain x y z values
                if feature_type == 'xyz':

                    chain = [self.chain1, self.chain2][int(line[0])]
                    pos = line[1:ntext]
                    feat_values = np.array(line[ntext:])

                # if the feature was written with bytes
                # i.e. chain resSeq resName (name) values
                # TODO deprecated?
                else:

                    # decode the line
                    line = line.decode('utf-8').split()

                    # get the position of the residue
                    chain, resName, resNum = line[0], line[1], line[2]

                    # get the atom name for atomic data
                    if feature_type == 'atomic':
                        atName = line[3]

                    # get the position
                    # TODO deprecated? the definition of the center position
                    # is different from that in features e.g. BSA.py
                    if feature_type == 'residue':
                        pos = np.mean(np.array(self.sqldb.get(
                            'x,y,z', chainID=chain, resSeq=resNum)), 0)
                        sql_resName = list(set(self.sqldb.get(
                            'resName', chainID=chain, resSeq=resNum)))
                    else:
                        pos = np.array(self.sqldb.get(
                            'x,y,z', chainID=chain, resSeq=resNum,
                            name=atName))[0]
                        sql_resName = list(set(self.sqldb.get(
                            'resName', chainID=chain, resSeq=resNum,
                            name=atName)))

                    # check that the resName corresponds
                    if len(sql_resName) == 0:
                        print('Error: SQL query returned empty list')
                        print('Tip : Make sure the parameter file ')
                        print('Tip : corresponds to the pdb file %s' %
                              (self.sqldb.pdbfile))
                        sys.exit()
                    else:
                        sql_resName = sql_resName[0]

                    if resName != sql_resName:
                        print('Residue Name Error in the Feature file ')
                        print('Feature File: chain %s resNum %s resName %s' %
                              (chain, resNum, resName))
                        print('SQL data    : chain %s resNum %s resName %s' %
                              (chain, resNum, sql_resName))
                        sys.exit()

                    # get the values of the feature(s) for this residue
                    feat_values = np.array(list(map(float, line[ntext:])))

                # postprocess the data
                if callable(transform):
                    feat_values = transform(feat_values)

                # handle the mode
                fname = feature_name
                if self.feature_mode == "diff":
                    coeff = {self.chain1: 1, self.chain2: -1}[chain]
                else:
                    coeff = 1
                if self.feature_mode == "ind":
                    chain_name = {self.chain1: '1', self.chain2: '2'}[chain]
                    fname = feature_name + "_chain" + chain_name

                tprocess += time() - t0

                t0 = time()
                # map this feature(s) on the grid(s)
                if not self.cuda:
                    if nFeat == 1:
                        dict_data[fname] += coeff * \
                            self.featgrid(pos, feat_values)
                    else:
                        for iF in range(nFeat):
                            dict_data[fname + '_%03d' % iF] += coeff * \
                                self.featgrid(pos, feat_values[iF])

                # try to use cuda to speed it up
                else:  # pragma: no cover
                    if nFeat == 1:
                        x0, y0, z0 = pos.astype(np.float32)
                        alpha = np.float32(coeff * feat_values)
                        self.cuda_func(alpha, x0, y0, z0,
                                       x_gpu, y_gpu, z_gpu, grid_gpu,
                                       block=tuple(self.gpu_block),
                                       grid=tuple(self.gpu_grid))
                    else:
                        raise ValueError(
                            'CUDA only possible for single-valued features')

                tgrid += time() - t0

            if self.cuda:  # pragma: no cover
                dict_data[fname] = grid_gpu.get()
                driver.Context.synchronize()

            logif('   Process time %f ms' % (tprocess * 1000), self.time)
            logif('   Grid time %f ms' % (tgrid * 1000), self.time)

        return dict_data
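    # Illustration (not part of the original code): in the current 'xyz'
    # format a feature row is purely numeric, e.g.
    #     [0., 12.3, 4.5, -6.7, 0.25]
    # which is read as: chain index 0 (mapped to chain1), position
    # (12.3, 4.5, -6.7) in Angs, and a single feature value 0.25 that is
    # spread on the grid around that position by featgrid().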
    # map a given feature on the grid
    def featgrid(self, center, value, type_='fast_gaussian'):
        """Map an individual feature (atomic or residue) on the grid.

        Args:
            center (list(float)): position of the feature center
            value (float): value of the feature
            type_ (str, optional): method used for the mapping

        Returns:
            np.array: mapped feature

        Raises:
            ValueError: if the mapping method is not recognized
        """
        # shortcut for the center
        x0, y0, z0 = center
        sigma = np.sqrt(1. / 2)
        beta = 0.5 / (sigma**2)

        # simple Gaussian
        if type_ == 'gaussian':
            dd = np.sqrt((self.xgrid - x0)**2
                         + (self.ygrid - y0)**2
                         + (self.zgrid - z0)**2)
            dd = value * np.exp(-beta * dd)
            return dd

        # fast Gaussian
        elif type_ == 'fast_gaussian':
            cutoff = 5. * beta
            dd = np.sqrt((self.xgrid - x0)**2
                         + (self.ygrid - y0)**2
                         + (self.zgrid - z0)**2)
            dgrid = np.zeros(self.npts)
            dgrid[dd < cutoff] = value * np.exp(-beta * dd[dd < cutoff])
            return dgrid

        # B-spline
        elif type_ == 'bspline':
            spline_order = 4
            spl = bspline((self.xgrid - x0) / self.res[0], spline_order) \
                * bspline((self.ygrid - y0) / self.res[1], spline_order) \
                * bspline((self.zgrid - z0) / self.res[2], spline_order)
            dd = value * spl
            return dd

        # nearest neighbours
        elif type_ == 'nearest':

            # distances
            dx = np.abs(self.x - x0)
            dy = np.abs(self.y - y0)
            dz = np.abs(self.z - z0)

            # indices of the two closest points along each axis
            indx = np.argsort(dx)[:2]
            indy = np.argsort(dy)[:2]
            indz = np.argsort(dz)[:2]

            # weights
            wx = dx[indx]
            wx /= np.sum(wx)
            wy = dy[indy]
            wy /= np.sum(wy)
            wz = dz[indz]
            wz /= np.sum(wz)

            # define the points
            indexes = [indx, indy, indz]
            points = list(itertools.product(*indexes))

            # define the weights
            W = [wx, wy, wz]
            W = list(itertools.product(*W))
            W = [np.sum(iw) for iw in W]

            # put that on the grid
            dgrid = np.zeros(self.npts)
            for w, pt in zip(W, points):
                dgrid[pt[0], pt[1], pt[2]] = w * value
            return dgrid

        # default
        else:
            raise ValueError(f'Option not recognized for the grid: {type_}')
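    # Illustration (not part of the original code): with sigma = sqrt(1/2)
    # the exponent is beta = 0.5 / sigma**2 = 1, so both Gaussian modes
    # evaluate value * exp(-d) at a grid point at distance d from the
    # feature center; 'fast_gaussian' only computes the exponential where
    # d < 5 (cutoff = 5 * beta) and leaves the rest of the grid at zero.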
    ################################################################
    # export the grid points for external calculations of some
    # features, for example the electrostatic potential etc ...
    ################################################################
    def export_grid_points(self):
        """Export the grid points to the hdf5 file."""

        grd = self.hdf5.require_group(self.mol_basename + '/grid_points')
        grd.create_dataset('x', data=self.x)
        grd.create_dataset('y', data=self.y)
        grd.create_dataset('z', data=self.z)

        # add the center or update it when the old value is different
        if 'center' not in grd:
            grd.create_dataset('center', data=self.center_contact)
        elif not all(grd['center'][()] == self.center_contact):
            grd['center'][...] = self.center_contact
    # save the data in the hdf5 file
    def hdf5_grid_data(self, dict_data, data_name):
        """Save the mapped features to the hdf5 file.

        Args:
            dict_data(dict): feature values stored as a dict
            data_name(str): feature name
        """
        # get the group of the feature
        feat_group = self.hdf5.require_group(
            self.mol_basename + '/mapped_features/' + data_name)

        # go through all the feature elements
        for key, value in dict_data.items():

            # remove only the subgroup
            if key in feat_group:
                del feat_group[key]

            # create a new one
            sub_feat_group = feat_group.create_group(key)

            # try a sparse representation
            if self.try_sparse:

                # check if the grid is sparse or not
                t0 = time()
                spg = sparse.FLANgrid()
                spg.from_dense(value, beta=1E-2)
                if self.time:
                    print('   Sparsing time %f ms' % ((time() - t0) * 1000))

                # if we have a sparse matrix
                if spg.sparse:
                    sub_feat_group.attrs['sparse'] = spg.sparse
                    sub_feat_group.attrs['type'] = 'sparse_matrix'
                    sub_feat_group.create_dataset(
                        'index', data=spg.index,
                        compression='gzip', compression_opts=9)
                    sub_feat_group.create_dataset(
                        'value', data=spg.value,
                        compression='gzip', compression_opts=9)

                # otherwise store the dense matrix
                else:
                    sub_feat_group.attrs['sparse'] = spg.sparse
                    sub_feat_group.attrs['type'] = 'dense_matrix'
                    sub_feat_group.create_dataset(
                        'value', data=spg.value,
                        compression='gzip', compression_opts=9)

            else:
                sub_feat_group.attrs['sparse'] = False
                sub_feat_group.attrs['type'] = 'dense_matrix'
                sub_feat_group.create_dataset(
                    'value', data=value,
                    compression='gzip', compression_opts=9)
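    # Illustration (not part of the original code): a minimal sketch of how
    # a grid written by this method could be read back with h5py. The file,
    # molecule and feature names are hypothetical, and the sparse decoding
    # assumes deeprank.tools.sparse.FLANgrid offers a matching constructor
    # and a to_dense() companion; the exact API may differ.
    #
    #   import h5py
    #   with h5py.File('output.hdf5', 'r') as f5:
    #       grp = f5['mol_name/mapped_features/Feature_ind/feat_chain1']
    #       if grp.attrs['sparse']:
    #           spg = sparse.FLANgrid(sparse=True,
    #                                 index=grp['index'][()],
    #                                 value=grp['value'][()],
    #                                 shape=(30, 30, 30))
    #           data = spg.to_dense()
    #       else:
    #           data = grp['value'][()]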
    ##########################################################################
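
# Illustration (not part of the original code): a minimal, hypothetical
# sketch of how GridTools is typically driven. It assumes an HDF5 file
# prepared by deeprank's DataGenerator, i.e. a molecule group containing a
# 'complex' PDB dataset and a 'features' subgroup; the file name, group
# name, chain IDs and feature names below are placeholders. Since the
# mapping is triggered from the constructor, instantiating the class is
# enough to create or update the grids.
#
#   import h5py
#   from deeprank.config import atom_vdw_radius_noH
#
#   with h5py.File('1AK4.hdf5', 'a') as f5:
#       molgrp = f5['1AK4']
#       GridTools(molgrp, chain1='C', chain2='D',
#                 number_of_points=30, resolution=1.,
#                 atomic_densities=atom_vdw_radius_noH,
#                 feature=['coulomb', 'vdwaals'],
#                 prog_bar=True)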