Source code for deeprank.generate.DataGenerator

import importlib
import copy
import os
import re
import sys
import warnings
from collections import OrderedDict

import h5py
import numpy as np

import deeprank
from deeprank import config
from deeprank.config import logger
from deeprank.generate import GridTools as gt
import pdb2sql
from pdb2sql.align import align as align_along_axis
from pdb2sql.align import align_interface

    from tqdm import tqdm
except ImportError:
    def tqdm(x):
        return x

    from pycuda import driver, compiler, gpuarray, tools
    import pycuda.autoinit
except ImportError:

[docs]def _printif(string, cond): return print(string) if cond else None
class DataGenerator(object):
    """Assemble a DeepRank HDF5 database (pdb data, features, targets, grids)."""

    def __init__(self, chain1, chain2,
                 pdb_select=None, pdb_source=None,
                 pdb_native=None, pssm_source=None, align=None,
                 compute_targets=None, compute_features=None,
                 data_augmentation=None, hdf5='database.h5',
                 mpi_comm=None):
        """Generate the data (features/targets/maps) required for deeprank.

        Args:
            chain1 (str): First chain ID
            chain2 (str): Second chain ID
            pdb_select (list(str), optional): List of individual
                conformations for mapping. A pdb is kept when one of
                these strings is contained in its path.
            pdb_source (list(str), optional): List of folders (or files)
                where to find the pdbs for mapping
            pdb_native (list(str), optional): List of folders (or files)
                where to find the native conformations, must be set if
                having targets to compute in parameter "compute_targets".
            pssm_source (list(str), optional): List of folders where to
                find the PSSM files
            align (dict, optional): Dictionary to align the complexes,
                e.g. align = {"selection":{"chainID":["A","B"]},"axis":"z"}
                e.g. align = {"selection":"interface","plane":"xy"}
                if "selection" is not specified the entire complex
                is used for alignment
            compute_targets (list(str), optional): List of python files
                computing the targets, "pdb_native" must be set if
                having targets to compute.
            compute_features (list(str), optional): List of python files
                computing the features
            data_augmentation (int, optional): Number of rotations
                performed on each complex
            hdf5 (str, optional): name of the hdf5 file where the data is
                saved, default to 'database.h5'
            mpi_comm (MPI_COMM): MPI COMMUNICATOR

        Raises:
            ValueError: if a PSSM feature is requested in
                "compute_features" while no PSSM source is configured.

        Note:
            Sources that are neither an existing folder nor an existing
            file are silently ignored here.

        Example:
            >>> from deeprank.generate import *
            >>> # sources to assemble the data base
            >>> pdb_source = ['./1AK4/decoys/']
            >>> pdb_native = ['./1AK4/native/']
            >>> pssm_source = ['./1AK4/pssm_new/']
            >>> h5file = '1ak4.hdf5'
            >>>
            >>> #init the data assembler
            >>> database = DataGenerator(chain1='C',
            >>>                          chain2='D',
            >>>                          pdb_source=pdb_source,
            >>>                          pdb_native=pdb_native,
            >>>                          pssm_source=pssm_source,
            >>>                          data_augmentation=None,
            >>>                          compute_targets=['deeprank.targets.dockQ'],
            >>>                          compute_features=['deeprank.features.AtomicFeature',
            >>>                                            'deeprank.features.PSSM_IC',
            >>>                                            'deeprank.features.BSA'],
            >>>                          hdf5=h5file)
        """
        self.chain1 = chain1
        self.chain2 = chain2

        self.pdb_select = pdb_select or []
        self.pdb_source = pdb_source or []
        self.pdb_native = pdb_native or []
        self.pssm_source = pssm_source
        self.align = align

        # the PSSM location is shared through the package configuration
        if self.pssm_source is not None:
            config.PATH_PSSM_SOURCE = self.pssm_source

        self.compute_targets = compute_targets
        self.compute_features = compute_features

        self.data_augmentation = data_augmentation

        self.hdf5 = hdf5

        self.mpi_comm = mpi_comm

        # set helper attributes
        self.all_pdb = []
        self.all_native = []
        self.pdb_path = []

        self.feature_error = []
        self.grid_error = []
        self.map_error = []

        self.logger = logger

        # handle the pdb_select
        if not isinstance(self.pdb_select, list):
            self.pdb_select = [self.pdb_select]

        # handle the sources
        if not isinstance(self.pdb_source, list):
            self.pdb_source = [self.pdb_source]

        # handle pssm source
        pssm_features = ('deeprank.features.FullPSSM',
                         'deeprank.features.PSSM_IC')
        if self.compute_features and \
                set.intersection(set(pssm_features),
                                 set(self.compute_features)):
            if config.PATH_PSSM_SOURCE is None:
                raise ValueError(
                    'You must provide "pssm_source" to compute PSSM features.')

        # get all the conformation path
        for src in self.pdb_source:
            if os.path.isdir(src):
                self.all_pdb += [os.path.join(src, fname)
                                 for fname in os.listdir(src)
                                 if fname.endswith('.pdb')]
            elif os.path.isfile(src):
                self.all_pdb.append(src)

        # handle the native
        if not isinstance(self.pdb_native, list):
            self.pdb_native = [self.pdb_native]

        for src in self.pdb_native:
            if os.path.isdir(src):
                self.all_native += [os.path.join(src, fname)
                                    for fname in os.listdir(src)]
            if os.path.isfile(src):
                self.all_native.append(src)

        # filter the cplx if required
        if self.pdb_select:
            for pattern in self.pdb_select:
                for path in self.all_pdb:
                    # a path matching several patterns is kept only once,
                    # otherwise the same complex would be processed twice
                    if pattern in path and path not in self.pdb_path:
                        self.pdb_path.append(path)
        else:
            self.pdb_path = self.all_pdb

    # ====================================================================================
    #       CREATE THE DATABASE ALL AT ONCE IF ALL OPTIONS ARE GIVEN
    # ====================================================================================
[docs] def create_database( self, verbose=False, remove_error=True, prog_bar=False, contact_distance=8.5, random_seed=None): """Create the hdf5 file architecture and compute the features/targets. Args: verbose (bool, optional): Print creation details remove_error (bool, optional): remove the groups that errored prog_bar (bool, optional): use tqdm contact_distance (float): contact distance cutoff, defaults to 8.5Å random_seed (int): random seed for getting rotation axis and angle Raises: ValueError: If creation of the group errored. Example: >>> # sources to assemble the data base >>> pdb_source = ['./1AK4/decoys/'] >>> pdb_native = ['./1AK4/native/'] >>> pssm_source = ['./1AK4/pssm_new/'] >>> h5file = '1ak4.hdf5' >>> >>> #init the data assembler >>> database = DataGenerator(chain1='C', >>> chain2='D', >>> pdb_source=pdb_source, >>> pdb_native=pdb_native, >>> pssm_source=pssm_source, >>> data_augmentation=None, >>> compute_targets = ['deeprank.targets.dockQ'], >>> compute_features = ['deeprank.features.AtomicFeature', >>> 'deeprank.features.PSSM_IC', >>> 'deeprank.features.BSA'], >>> hdf5=h5file) >>> >>> #create new files >>> database.create_database(prog_bar=True) """ # check decoy pdb files if not self.pdb_path: raise ValueError(f"Decoy pdb files not found. 
Check class " f"parameters 'pdb_source' and 'pdb_select'.") # deals with the parallelization self.local_pdbs = self.pdb_path if self.mpi_comm is not None: rank = self.mpi_comm.Get_rank() size = self.mpi_comm.Get_size() else: size = 1 if size > 1: if rank == 0: pdbs = [self.pdb_path[i::size] for i in range(size)] self.local_pdbs = pdbs[0] # send to other procs for iP in range(1, size): self.mpi_comm.send(pdbs[iP], dest=iP, tag=11) else: # receive procs self.local_pdbs = self.mpi_comm.recv(source=0, tag=11) # change hdf5 name h5path, h5name = os.path.split(self.hdf5) self.hdf5 = os.path.join(h5path, f"{rank:03d}_{h5name}") # open the file self.f5 = h5py.File(self.hdf5, 'w') # set metadata to hdf5 file self.f5.attrs['DeepRank_version'] = deeprank.__version__ self.f5.attrs['pdb_source'] = [ os.path.abspath(f) for f in self.pdb_source] self.f5.attrs['pdb_native'] = [ os.path.abspath(f) for f in self.pdb_native] self.f5.attrs['pssm_source'] = os.path.abspath( self.pssm_source) if self.compute_features is not None: self.f5.attrs['features'] = self.compute_features if self.compute_targets is not None: self.f5.attrs['targets'] = self.compute_targets ################################################## # Start generating HDF5 database ################################################## f'\n# Start creating HDF5 database: {self.hdf5}') # get the local progress bar desc = '{:25s}'.format('Creating database') cplx_tqdm = tqdm(self.local_pdbs, desc=desc, disable=not prog_bar) for cplx in cplx_tqdm: cplx_tqdm.set_postfix(mol=os.path.basename(cplx))'\nProcessing PDB file: {cplx}') # names of the molecule mol_name = os.path.splitext(os.path.basename(cplx))[0] mol_aug_name_list = [] try: ################################################ # get the pdbs of the conformation and its ref # for the original data (not augmetned one) ################################################ if verbose: f'\nMolecule: {mol_name}.' f'\nStart generating top HDF5 group "{mol_name}"...' 
f'\n{"":4s}Reading PDB data into database...') # get the bare name of the molecule # and define the name of the native # i.e. 1AK4_100w -> 1AK4 bare_mol_name = mol_name.split('_')[0] ref_name = bare_mol_name + '.pdb' # check if we have a decoy or native # and find the reference if mol_name == bare_mol_name: ref = cplx else: if len(self.all_native) > 0: ref = list( filter(lambda x: ref_name in x, self.all_native)) if len(ref) == 0: raise ValueError('Native not found') else: if len(ref) > 1: warnings.warn( f'Multiple native reference found, here used {ref[0]}') ref = ref[0] if ref == '': ref = None else: ref = None # crete a subgroup for the molecule molgrp = self.f5.require_group(mol_name) molgrp.attrs['type'] = 'molecule' # add the ref and the complex self._add_pdb(molgrp, cplx, 'complex') if ref is not None: self._add_pdb(molgrp, ref, 'native') if verbose: f'{"":4s}Generated subgroup "complex"' f' to store pdb data of the current model.') if ref: f'{"":4s}Generated subgroup "native"' f' to store pdb data of the reference molecule.') ################################################ # add the features ################################################ feature_error_flag = False # when False: success; when True: failed if self.compute_features is not None: if verbose: f'{"":4s}Calculating features...') molgrp.require_group('features') molgrp.require_group('features_raw') feature_error_flag = self._compute_features(self.compute_features, molgrp['complex'][( )], molgrp['features'], molgrp['features_raw'], self.chain1, self.chain2, self.logger) if feature_error_flag: self.feature_error += [mol_name] # ignore the targets/grid/augmentation computation # and directly go to next molecule. Remove errored # molecule later. # Otherwise, keep computing and report errored mol. if remove_error: continue if verbose: if not feature_error_flag or not remove_error: f'\n{"":4s}Generated subgroup "features"' f' to store xyz-based feature values.' 
f'{"":4s}Generated subgroup "features_raw"' f' to store human read feature values') ################################################ # add the targets ################################################ if self.compute_targets is not None: if verbose: f'{"":4s}Calculating targets...') molgrp.require_group('targets') self._compute_targets(self.compute_targets, molgrp['complex'][()], molgrp['targets']) if verbose: f'{"":4s}Generated subgroup "targets" ' f'to store targets, such as BIN_CLASS, dockQ, etc.') ################################################ # add the box center ################################################ if verbose: f'{"":4s}Calculating grid box center...') grid_error_flag = False molgrp.require_group('grid_points') try: center = self._get_grid_center( molgrp['complex'][()], contact_distance) molgrp['grid_points'].create_dataset( 'center', data=center) if verbose: f'{"":4s}Generated subgroup "grid_points"' f' to store grid box center.') except ValueError as ex: grid_error_flag = True self.grid_error += [mol_name] self.logger.exception(ex) if remove_error: continue ################################################ # DATA AUGMENTATION ################################################ # GET ALL THE NAMES if self.data_augmentation is not None: mol_aug_name_list = [ mol_name + '_r%03d' % (idir + 1) for idir in range( self.data_augmentation)] else: mol_aug_name_list = [] if verbose and mol_aug_name_list: f'{"":2s}Start augmenting data' f' with {self.data_augmentation} times...') # loop over the complexes for mol_aug_name in mol_aug_name_list: # crete a subgroup for the molecule molgrp = self.f5.require_group(mol_aug_name) molgrp.attrs['type'] = 'molecule' # copy the ref into it if ref is not None: self._add_pdb(molgrp, ref, 'native') # get the rotation axis and angle if self.align is None: axis, angle = pdb2sql.transform.get_rot_axis_angle( random_seed) else: axis, angle = self._get_aligned_rotation_axis_angle(random_seed, self.align) # create the new pdb and 
get molecule center # molecule center is the origin of rotation) mol_center = self._add_aug_pdb( molgrp, cplx, 'complex', axis, angle) # copy the targets/features if 'targets' in self.f5[mol_name]: self.f5.copy(mol_name + '/targets/', molgrp) self.f5.copy(mol_name + '/features/', molgrp) # rotate the feature self._rotate_feature( molgrp, axis, angle, mol_center) # grid center used to create grid box molgrp.require_group('grid_points') center = pdb2sql.transform.rot_xyz_around_axis( self.f5[mol_name + '/grid_points/center'], axis, angle, mol_center) molgrp['grid_points'].create_dataset( 'center', data=center) # store the rotation axis/angl/center as attriutes # in case we need them later molgrp.attrs['axis'] = axis molgrp.attrs['angle'] = angle molgrp.attrs['center'] = mol_center # cache aug mols if original mol has errored features if feature_error_flag: self.feature_error += mol_aug_name_list if grid_error_flag: self.grid_error += mol_aug_name_list if verbose and mol_aug_name_list: f'{"":2s}Completed data augmentation' f' and generated top HDF5 groups, e.g. 
{mol_aug_name}.') ################################################ # Successul message ################################################ if verbose: f'\nSuccessfully generated top HDF5 group "{mol_name}".\n') # all other errors except BaseException: raise ################################################## # Post processing ################################################## # Remove errored molecules errored_mol = list(set(self.feature_error + self.grid_error)) if errored_mol: if remove_error: for mol in errored_mol: del self.f5[mol] if self.feature_error: f'Molecules with errored features are removed:' f'\n{self.feature_error}') if self.grid_error: f'Molecules with errored grid points are removed:' f'\n{self.grid_error}') else: if self.feature_error: self.logger.warning( f'The following molecules have errored features:' f'\n{self.feature_error}') if self.grid_error: self.logger.warning( f'The following molecules have errored grid points:' f'\n{self.grid_error}') # close the file self.f5.close() f'\n# Successfully created database: {self.hdf5}\n')
[docs] def aug_data(self, augmentation, keep_existing_aug=True, random_seed=None): """Augment exiting original PDB data and features. Args: augmentation(int): Times of augmentation keep_existing_aug (bool, optional): Keep existing augmentated data. If False, existing aug will be removed. Defaults to True. Examples: >>> database = DataGenerator(h5='database.h5') >>> database.aug_data(augmentation=3, append=True) >>> grid_info = { >>> 'number_of_points': [30,30,30], >>> 'resolution': [1.,1.,1.], >>> 'atomic_densities': {'C':1.7, 'N':1.55, 'O':1.52, 'S':1.8}, >>> } >>> database.map_features(grid_info) """ # check if file exists if not os.path.isfile(self.hdf5): raise FileNotFoundError( 'File %s does not exists' % self.hdf5) # get the folder names f5 = h5py.File(self.hdf5, 'a') fnames = f5.keys() # get the non rotated ones fnames_original = list( filter(lambda x: not'_r\d+$', x), fnames)) # get the rotated ones fnames_augmented = list( filter(lambda x:'_r\d+$', x), fnames)) aug_id_start = 0 if keep_existing_aug: exiting_augs = list( filter(lambda x:[0] + r'_r\d+$', x), fnames_augmented)) aug_id_start += len(exiting_augs) else: for i in fnames_augmented: del f5[i] f'{"":s}\n# Start augmenting data' f' with {augmentation} times...') # GET ALL THE NAMES for mol_name in fnames_original: mol_aug_name_list = [ mol_name + '_r%03d' % (idir + 1) for idir in range(aug_id_start, aug_id_start + augmentation)] # loop over the complexes for mol_aug_name in mol_aug_name_list: # crete a subgroup for the molecule molgrp = f5.require_group(mol_aug_name) molgrp.attrs['type'] = 'molecule' # copy the ref into it if 'native' in f5[mol_name]: f5.copy(mol_name + '/native', molgrp) # get the rotation axis and angle if self.align is None: axis, angle = pdb2sql.transform.get_rot_axis_angle( random_seed) else: axis, angle = self._get_aligned_rotation_axis_angle(random_seed, self.align) # create the new pdb and get molecule center # molecule center is the origin of rotation) mol_center = 
self._add_aug_pdb( molgrp, f5[mol_name + '/complex'][()], 'complex', axis, angle) # copy the targets/features if 'targets' in f5[mol_name]: f5.copy(mol_name + '/targets/', molgrp) f5.copy(mol_name + '/features/', molgrp) # rotate the feature self._rotate_feature(molgrp, axis, angle, mol_center) # grid center used to create grid box molgrp.require_group('grid_points') center = pdb2sql.transform.rot_xyz_around_axis( f5[mol_name + '/grid_points/center'], axis, angle, mol_center) molgrp['grid_points'].create_dataset( 'center', data=center) # store the rotation axis/angl/center as attriutes # in case we need them later molgrp.attrs['axis'] = axis molgrp.attrs['angle'] = angle molgrp.attrs['center'] = mol_center f5.close() f'\n# Successfully augmented data in {self.hdf5}')
# ==================================================================================== # # ADD FEATURES TO AN EXISTING DATASET # # ====================================================================================
[docs] def add_feature(self, remove_error=True, prog_bar=True): """Add a feature to an existing hdf5 file. Args: remove_error (bool): remove errored molecule prog_bar (bool, optional): use tqdm Example: >>> h5file = '1ak4.hdf5' >>> >>> #init the data assembler >>> database = DataGenerator(compute_features = ['deeprank.features.ResidueDensity'], >>> hdf5=h5file) >>> >>> database.add_feature(remove_error=True, prog_bar=True) """ # check if file exists if not os.path.isfile(self.hdf5): raise FileNotFoundError( 'File %s does not exists' % self.hdf5) # get the folder names f5 = h5py.File(self.hdf5, 'a') fnames = f5.keys() # get the non rotated ones fnames_original = list( filter(lambda x: not'_r\d+$', x), fnames)) # get the rotated ones fnames_augmented = list( filter(lambda x:'_r\d+$', x), fnames)) # check feature_error if not self.feature_error: self.feature_error = [] # computes the features of the original desc = '{:25s}'.format('Add features') for cplx_name in tqdm( fnames_original, desc=desc, ncols=100, disable=not prog_bar): # molgrp molgrp = f5[cplx_name] error_flag = False if self.compute_features is not None: # the internal features molgrp.require_group('features') molgrp.require_group('features_raw') error_flag = self._compute_features(self.compute_features, molgrp['complex'][()], molgrp['features'], molgrp['features_raw'], self.chain1, self.chain2, self.logger) if error_flag: self.feature_error += [cplx_name] # copy the data from the original to the augmented for cplx_name in fnames_augmented: # group of the molecule aug_molgrp = f5[cplx_name] # get the source group mol_name = re.split(r'_r\d+',[0] src_molgrp = f5[mol_name] # get the rotation parameters axis = aug_molgrp.attrs['axis'] angle = aug_molgrp.attrs['angle'] center = aug_molgrp.attrs['center'] # copy the features to the augmented for k in molgrp['features']: if k not in aug_molgrp['features']: # copy data = src_molgrp['features/' + k][()] aug_molgrp.require_group('features') 
aug_molgrp.create_dataset( "features/" + k, data=data) # rotate self._rotate_feature( aug_molgrp, axis, angle, center, feat_name=[k]) # find errored augmented molecules tmp_aug_error = [] for mol in self.feature_error: tmp_aug_error += list(filter(lambda x: mol in x, fnames_augmented)) self.feature_error += tmp_aug_error # Remove errored molecules if self.feature_error: if remove_error: for mol in self.feature_error: del f5[mol] f'Molecules with errored features are removed:\n' f'{self.feature_error}') else: self.logger.warning( f"The following molecules has errored features:\n" f'{self.feature_error}') # close the file f5.close()
# ==================================================================================== # # ADD TARGETS TO AN EXISTING DATASET # # ====================================================================================
[docs] def add_unique_target(self, targdict): """Add identical targets for all the complexes in the datafile. This is usefull if you want to add the binary class of all the complexes created from decoys or natives Args: targdict (dict): Example: {'DOCKQ':1.0} >>> database = DataGenerator(hdf5='1ak4.hdf5') >>> database.add_unique_target({'DOCKQ':1.0}) """ # check if file exists if not os.path.isfile(self.hdf5): raise FileNotFoundError( 'File %s does not exists' % self.hdf5) f5 = h5py.File(self.hdf5, 'a') for mol in list(f5.keys()): targrp = f5[mol].require_group('targets') for name, value in targdict.items(): targrp.create_dataset(name, data=np.array([value])) f5.close()
[docs] def add_target(self, prog_bar=False): """Add a target to an existing hdf5 file. Args: prog_bar (bool, optional): Use tqdm Example: >>> h5file = '1ak4.hdf5' >>> >>> #init the data assembler >>> database = DataGenerator(compute_targets =['deeprank.targets.binary_class'], >>> hdf5=h5file) >>> >>> database.add_target(prog_bar=True) """ # check if file exists if not os.path.isfile(self.hdf5): raise FileNotFoundError( 'File %s does not exists' % self.hdf5) # name of the hdf5 file f5 = h5py.File(self.hdf5, 'a') # get the folder names fnames = f5.keys() # get the non rotated ones fnames_original = list( filter(lambda x: not'_r\d+$', x), fnames)) fnames_augmented = list( filter(lambda x:'_r\d+$', x), fnames)) # compute the targets of the original desc = '{:25s}'.format('Add targets') for cplx_name in tqdm(fnames_original, desc=desc, ncols=100, disable=not prog_bar): # group of the molecule molgrp = f5[cplx_name] # add the targets if self.compute_targets is not None: molgrp.require_group('targets') self._compute_targets(self.compute_targets, molgrp['complex'][()], molgrp['targets']) # copy the targets of the original to the rotated for cplx_name in fnames_augmented: # group of the molecule aug_molgrp = f5[cplx_name] # get the source group mol_name = re.split(r'_r\d+',[0] src_molgrp = f5[mol_name] # copy the targets to the augmented for k in molgrp['targets']: if k not in aug_molgrp['targets']: data = src_molgrp['targets/' + k][()] aug_molgrp.require_group('targets') aug_molgrp.create_dataset( "targets/" + k, data=data) # close the file f5.close()
[docs] def realign_complexes(self, align, compute_features=None, pssm_source=None): """Align all the complexes already present in the HDF5. Arguments: align {dict} -- alignement dictionary (see __init__) Keyword Arguments: compute_features {list} -- list of features to be computed if None computes the features specified in the attrs['features'] of the file (if present) pssm_source {str} -- path of the pssm files. If None the source specfied in the attrs['pssm_source'] will be used (if present) (default: {None}) Raises: ValueError: If no PSSM detected Example: >>> database = DataGenerator(hdf5='1ak4.hdf5') >>> # if comute_features and pssm_source are not specified >>> # the values in hdf5.attrs['features'] and hdf5.attrs['pssm_source'] will be used >>> database.realign_complex(align={'axis':'x'}, >>> compute_features['deeprank.features.X'], >>> pssm_source='./1ak4_pssm/') """ f5 = h5py.File(self.hdf5, 'a') mol_names = f5.keys() f'\n# Start aligning the HDF5 database: {self.hdf5}') # deal with the features if self.compute_features is None: if compute_features is None: if 'features' in f5.attrs: self.compute_features = list(f5.attrs['features']) else: self.compute_features = compute_features # deal with the pssm source if self.pssm_source is not None: config.PATH_PSSM_SOURCE = self.pssm_source elif pssm_source is not None: config.PATH_PSSM_SOURCE = pssm_source elif 'pssm_source' in f5.attrs: config.PATH_PSSM_SOURCE = f5.attrs['pssm_source'] else: raise ValueError('No pssm source detected') # loop over the complexes desc = '{:25s}'.format('Add features') for mol in tqdm(mol_names, desc=desc, ncols=100): # align the pdb molgrp = f5[mol] pdb = molgrp['complex'][()] sqldb = self._get_aligned_sqldb(pdb, align) data = sqldb.sql2pdb() data = np.array(data).astype('|S78') molgrp['complex'][...] 
= data # remove prexisting features old_dir = ['features', 'features_raw', 'mapped_features'] for od in old_dir: if od in molgrp: del molgrp[od] # the internal features molgrp.require_group('features') molgrp.require_group('features_raw') # compute features error_flag = self._compute_features(self.compute_features, molgrp['complex'][()], molgrp['features'], molgrp['features_raw'], self.chain1, self.chain2, self.logger) f5.close()
# ==================================================================================== # # PRECOMPUTE THE GRID POINTS # # ====================================================================================
[docs] def _get_grid_center(self, pdb, contact_distance): sqldb = pdb2sql.interface(pdb) contact_atoms = sqldb.get_contact_atoms(cutoff=contact_distance, chain1=self.chain1, chain2=self.chain2) tmp = [] for i in contact_atoms.values(): tmp.extend(i) contact_atoms = list(set(tmp)) center_contact = np.mean( np.array(sqldb.get('x,y,z', rowID=contact_atoms)), 0) sqldb._close() return center_contact
[docs] def precompute_grid(self, grid_info, contact_distance=8.5, prog_bar=False, time=False, try_sparse=True): # name of the hdf5 file f5 = h5py.File(self.hdf5, 'a') # check all the input PDB files mol_names = f5.keys() # get the local progress bar desc = '{:25s}'.format('Precompute grid points') mol_tqdm = tqdm(mol_names, desc=desc, disable=not prog_bar) if not prog_bar: print(desc, ':', self.hdf5) sys.stdout.flush() # loop over the data files for mol in mol_tqdm: mol_tqdm.set_postfix(mol=mol) # compute the data we want on the grid gt.GridTools(molgrp=f5[mol], chain1=self.chain1, chain2=self.chain2, number_of_points=grid_info['number_of_points'], resolution=grid_info['resolution'], contact_distance=contact_distance, time=time, prog_bar=prog_bar, try_sparse=try_sparse) f5.close()
# ==================================================================================== # # MAP THE FEATURES TO THE GRID # # ====================================================================================
[docs] def map_features(self, grid_info={}, cuda=False, gpu_block=None, cuda_kernel='kernel_map.c', cuda_func_name='gaussian', try_sparse=True, reset=False, use_tmpdir=False, time=False, prog_bar=True, grid_prog_bar=False, remove_error=True): """Map the feature on a grid of points centered at the interface. If features to map are not given, they will be are automatically determined for each molecule. Otherwise, given features will be mapped for all molecules (i.e. existing mapped features will be recalculated). Args: grid_info (dict): Informaton for the grid. See for details. cuda (bool, optional): Use CUDA gpu_block (None, optional): GPU block size to be used cuda_kernel (str, optional): filename containing CUDA kernel cuda_func_name (str, optional): The name of the function in the kernel try_sparse (bool, optional): Try to save the grids as sparse format reset (bool, optional): remove grids if some are already present use_tmpdir (bool, optional): use a scratch directory time (bool, optional): time the mapping process prog_bar (bool, optional): use tqdm for each molecule grid_prog_bar (bool, optional): use tqdm for each grid remove_error (bool, optional): remove the data that errored Example: >>> #init the data assembler >>> database = DataGenerator(hdf5='1ak4.hdf5') >>> >>> # map the features >>> grid_info = { >>> 'number_of_points': [30,30,30], >>> 'resolution': [1.,1.,1.], >>> 'atomic_densities': {'C':1.7, 'N':1.55, 'O':1.52, 'S':1.8}, >>> } >>> >>> database.map_features(grid_info,try_sparse=True,time=False,prog_bar=True) """ # default CUDA cuda_func = None cuda_atomic = None # disable CUDA when using MPI if self.mpi_comm is not None: if self.mpi_comm.Get_size() > 1: if cuda: self.logger.warning( 'CUDA mapping disabled when using MPI') cuda = False # name of the hdf5 file f5 = h5py.File(self.hdf5, 'a') # check all the input PDB files mol_names = f5.keys() if len(mol_names) == 0: f5.close() raise ValueError(f'No molecules found in {self.hdf5}.') 
################################################################ # Check grid_info ################################################################ # fills in the grid data if not provided: default = NONE grid_info_ref = copy.deepcopy(grid_info) grinfo = ['number_of_points', 'resolution'] for gr in grinfo: if gr not in grid_info: grid_info[gr] = None # by default we do not map atomic densities if 'atomic_densities' not in grid_info: grid_info['atomic_densities'] = None # fills in the features mode if somes are missing: default = IND modes = ['atomic_densities_mode', 'feature_mode'] for m in modes: if m not in grid_info: grid_info[m] = 'ind' ################################################################ # ################################################################ # sanity check for cuda if cuda and gpu_block is None: # pragma: no cover f'GPU block automatically set to 8 x 8 x 8. ' f'You can set block size with gpu_block=[n,m,k]') gpu_block = [8, 8, 8] # initialize cuda if cuda: # pragma: no cover # compile cuda module npts = grid_info['number_of_points'] res = grid_info['resolution'] module = self._compile_cuda_kernel(cuda_kernel, npts, res) # get the cuda function for the atomic/residue feature cuda_func = self._get_cuda_function( module, cuda_func_name) # get the cuda function for the atomic densties cuda_atomic_name = 'atomic_densities' cuda_atomic = self._get_cuda_function( module, cuda_atomic_name) # get the local progress bar desc = '{:25s}'.format('Map Features') mol_tqdm = tqdm(mol_names, desc=desc, disable=not prog_bar) if not prog_bar:'{desc}: {self.hdf5}') # loop over the data files for mol in mol_tqdm: mol_tqdm.set_postfix(mol=mol) # Determine which feature to map # if feature not given, then determine it for each molecule if 'feature' not in grid_info_ref: # if we havent mapped anything yet or if we reset if 'mapped_features' not in list(f5[mol].keys()) or reset: grid_info['feature'] = list( f5[mol + '/features'].keys()) # if we have already 
mapped stuff elif 'mapped_features' in list(f5[mol].keys()): # feature name all_feat = list(f5[mol + '/features'].keys()) # feature already mapped mapped_feat = list( f5[mol + '/mapped_features/Feature_ind'].keys()) # we select only the feture that were not mapped yet grid_info['feature'] = [] for feat_name in all_feat: if not any(map(lambda x: x.startswith(feat_name + '_'), mapped_feat)): grid_info['feature'].append(feat_name) try: # compute the data we want on the grid gt.GridTools( molgrp=f5[mol], chain1=self.chain1, chain2=self.chain2, number_of_points=grid_info['number_of_points'], resolution=grid_info['resolution'], atomic_densities=grid_info['atomic_densities'], atomic_densities_mode=grid_info['atomic_densities_mode'], feature=grid_info['feature'], feature_mode=grid_info['feature_mode'], cuda=cuda, gpu_block=gpu_block, cuda_func=cuda_func, cuda_atomic=cuda_atomic, time=time, prog_bar=grid_prog_bar, try_sparse=try_sparse) except BaseException: self.map_error.append(mol) self.logger.exception( f'Error during the mapping of {mol}') # remove the molecule with issues if self.map_error: if remove_error: for mol in self.map_error: del f5[mol] self.logger.warning( f"Molecules with errored feature mapping are removed:\n" f"{self.map_error}") else: self.logger.warning( f"The following moleclues have errored feature mapping:\n" f"{self.map_error}") # close he hdf5 file f5.close()
# ==================================================================================== # # REMOVE DATA FROM THE DATA SET # # ====================================================================================
[docs] def remove(self, feature=True, pdb=True, points=True, grid=False): """Remove data from the data set. Equivalent to the cleandata command line tool. Once the data has been removed from the file it is impossible to add new features/targets Args: feature (bool, optional): Remove the features pdb (bool, optional): Remove the pdbs points (bool, optional): remove teh grid points grid (bool, optional): remove the maps """ self.logger.debug('Remove features') # name of the hdf5 file f5 = h5py.File(self.hdf5, 'a') # get the folder names mol_names = f5.keys() for name in mol_names: mol_grp = f5[name] if feature and 'features' in mol_grp: del mol_grp['features'] del mol_grp['features_raw'] if pdb and 'complex' in mol_grp and 'native' in mol_grp: del mol_grp['complex'] del mol_grp['native'] if points and 'grid_points' in mol_grp: del mol_grp['grid_points'] if grid and 'mapped_features' in mol_grp: del mol_grp['mapped_features'] f5.close() # reclaim the space os.system('h5repack %s _tmp.h5py' % self.hdf5) os.system('mv _tmp.h5py %s' % self.hdf5)
# ==================================================================================== # # Simply tune or test the kernel # # ====================================================================================
[docs] def _tune_cuda_kernel(self, grid_info, cuda_kernel='kernel_map.c', func='gaussian'): # pragma: no cover """Tune the CUDA kernel using the kernel tuner Args: grid_info (dict): information for the grid definition cuda_kernel (str, optional): file containing the kernel func (str, optional): function in the kernel to be used Raises: ValueError: If the tuner has not been used """ try: from kernel_tuner import tune_kernel except BaseException: print( 'Install the Kernel Tuner: \n \t\t pip install kernel_tuner') print('') # fills in the grid data if not provided: default = NONE grinfo = ['number_of_points', 'resolution'] for gr in grinfo: if gr not in grid_info: raise ValueError( '%s must be specified to tune the kernel') # define the grid nx, ny, nz = grid_info['number_of_points'] dx, dy, dz = grid_info['resolution'] lx, ly, lz = nx * dx, ny * dy, nz * dz x = np.linspace(0, lx, nx) y = np.linspace(0, ly, ny) z = np.linspace(0, lz, nz) # create the dictionary containing the tune parameters tune_params = OrderedDict() tune_params['block_size_x'] = [2, 4, 8, 16, 32] tune_params['block_size_y'] = [2, 4, 8, 16, 32] tune_params['block_size_z'] = [2, 4, 8, 16, 32] # define the final grid grid = np.zeros(grid_info['number_of_points']) # arguments of the CUDA function x0, y0, z0 = np.float32(0), np.float32(0), np.float32(0) alpha = np.float32(0) args = [alpha, x0, y0, z0, x, y, z, grid] # dimensionality problem_size = grid_info['number_of_points'] # get the kernel kernel = os.path.dirname( os.path.abspath(__file__)) + '/' + cuda_kernel kernel_code_template = open(kernel, 'r').read() npts = grid_info['number_of_points'] res = grid_info['resolution'] kernel_code = kernel_code_template % { 'nx': npts[0], 'ny': npts[1], 'nz': npts[2], 'RES': np.max(res)} tunable_kernel = self._tunable_kernel(kernel_code) # tune tune_kernel(func, tunable_kernel, problem_size, args, tune_params)
# ==================================================================================== # # Simply test the kernel # # ====================================================================================
[docs] def _test_cuda(self, grid_info, gpu_block=8, cuda_kernel='kernel_map.c', func='gaussian'): # pragma: no cover """Test the CUDA kernel. Args: grid_info (dict): Information for the grid definition gpu_block (int, optional): GPU block size to be used cuda_kernel (str, optional): File containing the kernel func (str, optional): function in the kernel to be used Raises: ValueError: If the kernel has not been installed """ from time import time # fills in the grid data if not provided: default = NONE grinfo = ['number_of_points', 'resolution'] for gr in grinfo: if gr not in grid_info: raise ValueError( '%s must be specified to tune the kernel') # get the cuda function npts = grid_info['number_of_points'] res = grid_info['resolution'] module = self._compile_cuda_kernel(cuda_kernel, npts, res) cuda_func = self._get_cuda_function(module, func) # define the grid nx, ny, nz = grid_info['number_of_points'] dx, dy, dz = grid_info['resolution'] lx, ly, lz = nx * dx, ny * dy, nz * dz # create the coordinate x = np.linspace(0, lx, nx) y = np.linspace(0, ly, ny) z = np.linspace(0, lz, nz) # book memp on the gpu x_gpu = gpuarray.to_gpu(x.astype(np.float32)) y_gpu = gpuarray.to_gpu(y.astype(np.float32)) z_gpu = gpuarray.to_gpu(z.astype(np.float32)) grid_gpu = gpuarray.zeros( grid_info['number_of_points'], np.float32) # make sure we have three block value if not isinstance(gpu_block, list): gpu_block = [gpu_block] * 3 # get the grid gpu_grid = [int(np.ceil(n / b)) for b, n in zip(gpu_block, grid_info['number_of_points'])] print('GPU BLOCK:', gpu_block) print('GPU GRID :', gpu_grid) xyz_center = np.random.rand(500, 3).astype(np.float32) alpha = np.float32(1) t0 = time() for xyz in xyz_center: x0, y0, z0 = xyz cuda_func(alpha, x0, y0, z0, x_gpu, y_gpu, z_gpu, grid_gpu, block=tuple(gpu_block), grid=tuple(gpu_grid)) print('Done in: %f ms' % ((time() - t0) * 1000))
# ==================================================================================== # # Routines needed to handle CUDA # # ====================================================================================
@staticmethod
def _compile_cuda_kernel(cuda_kernel, npts, res):  # pragma: no cover
    """Compile the cuda kernel.

    The kernel file is read from the directory of this module and used
    as a %-format template filled with 'nx', 'ny', 'nz' (from npts) and
    'RES' (the largest value of res).

    Args:
        cuda_kernel (str): filename
        npts (tuple(int)): number of grid points in each direction
        res (tuple(float)): resolution in each direction

    Returns:
        compiler.SourceModule: compiled kernel
    """
    # get the cuda kernel path
    kernel = os.path.dirname(
        os.path.abspath(__file__)) + '/' + cuda_kernel

    # read the template; the context manager closes the file handle
    with open(kernel, 'r') as kernel_file:
        kernel_code_template = kernel_file.read()

    kernel_code = kernel_code_template % {
        'nx': npts[0], 'ny': npts[1], 'nz': npts[2], 'RES': np.max(res)}

    # compile the kernel
    return compiler.SourceModule(kernel_code)
[docs] @staticmethod def _get_cuda_function(module, func_name): # pragma: no cover """Get a single function from the compiled kernel. Args: module (compiler.SourceModule): compiled kernel module func_name (str): Name of the funtion Returns: func: cuda function """ cuda_func = module.get_function(func_name) return cuda_func
# transform the kernel to a tunable one
[docs] @staticmethod def _tunable_kernel(kernel): # pragma: no cover """Make a tunale kernel. Args: kernel (str): String of the kernel Returns: TYPE: tunable kernel """ switch_name = { 'blockDim.x': 'block_size_x', 'blockDim.y': 'block_size_y', 'blockDim.z': 'block_size_z'} for old, new in switch_name.items(): kernel = kernel.replace(old, new) return kernel
# ==================================================================================== # # FILTER DATASET # # ===================================================================================
[docs] def _filter_cplx(self): """Filter the name of the complexes.""" # read the class ID with open(self.pdb_select) as f: pdb_name = f.readlines() pdb_name = [name.split()[0] + '.pdb' for name in pdb_name] # create the filters tmp_path = [] for name in pdb_name: tmp_path += list(filter(lambda x: name in x, self.pdb_path)) # update the pdb_path self.pdb_path = tmp_path
# ==================================================================================== # # FEATURES ROUTINES # # ====================================================================================
[docs] @staticmethod def _compute_features(feat_list, pdb_data, featgrp, featgrp_raw, chain1, chain2, logger): """Compute the features. Args: feat_list (list(str)): list of function name, e.g., ['deeprank.features.ResidueDensity', 'deeprank.features.PSSM_IC'] pdb_data (bytes): PDB translated in bytes featgrp (str): name of the group where to store the xyz feature featgrp_raw (str): name of the group where to store the raw feature chain1 (str): First chain ID chain2 (str): Second chain ID logger (logger): name of logger object Return: bool: error happened or not """ error_flag = False # when False: success; when True: failed for feat in feat_list: try: feat_module = importlib.import_module(feat, package=None) feat_module.__compute_feature__(pdb_data, featgrp, featgrp_raw, chain1, chain2) except Exception as ex: logger.exception(ex) error_flag = True return error_flag
# ==================================================================================== # # TARGETS ROUTINES # # ====================================================================================
[docs] @staticmethod def _compute_targets(targ_list, pdb_data, targrp): """Compute the targets. Args: targ_list (list(str)): list of function name pdb_data (bytes): PDB translated in btes targrp (str): name of the group where to store the targets logger (logger): name of logger object """ for targ in targ_list: targ_module = importlib.import_module(targ, package=None) targ_module.__compute_target__(pdb_data, targrp)
# ==================================================================================== # # ADD PDB FILE # # ====================================================================================
[docs] def _add_pdb(self, molgrp, pdbfile, name): """Add a pdb to a molgrp. Args: molgrp (str): mopl group where tp add the pdb pdbfile (str): psb file to add name (str): dataset name in the hdf5 molgroup """ # no alignement if self.align is None: # read the pdb and extract the ATOM lines with open(pdbfile, 'r') as fi: data = [line.split('\n')[0] for line in fi if line.startswith('ATOM')] # some alignement elif isinstance(self.align, dict): sqldb = self._get_aligned_sqldb(pdbfile, self.align) data = sqldb.sql2pdb() # PDB default line length is 80 # data = np.array(data).astype('|S78') molgrp.create_dataset(name, data=data)
# @staticmethod
def _get_aligned_sqldb(self, pdbfile, dict_align):
    """Return a sqldb of the pdb that is aligned as specified in the dict.

    dict_align is only read: missing 'selection' and 'export' entries
    default to ``{}`` and ``False`` without being written back into the
    caller's dictionary.

    Args:
        pdbfile (str): path to the pdb
        dict_align (dict): options to align the pdb. With
            'selection' == 'interface' the 'plane' entry is required,
            otherwise the 'axis' entry is required and 'selection' is
            passed on as keyword arguments.

    Returns:
        the object returned by align_interface / align_along_axis

    Raises:
        KeyError: if the required 'plane' or 'axis' entry is missing
    """
    selection = dict_align.get('selection', {})
    export = dict_align.get('export', False)

    if selection == 'interface':
        return align_interface(pdbfile,
                               plane=dict_align['plane'],
                               export=export,
                               chain1=self.chain1,
                               chain2=self.chain2)

    return align_along_axis(pdbfile,
                            axis=dict_align['axis'],
                            export=export,
                            **selection)
# ==================================================================================== # # AUGMENTED DATA # # ====================================================================================
[docs] @staticmethod def _get_aligned_rotation_axis_angle(random_seed, dict_align): """Returns the axis and angle of rotation for data augmentation with aligned complexes Arguments: random_seed {int} -- random seed of rotation dict_align {dict} -- the dict describing the alignement Returns: list(float): axis of rotation float: angle of rotation """ if random_seed is not None: np.random.seed(random_seed) angle = 2 * np.pi * np.random.rand() if 'plane' in dict_align.keys(): if dict_align['plane'] == 'xy': axis = [0., 0., 1.] elif dict_align['plane'] == 'xz': axis = [0., 1., 0.] elif dict_align['plane'] == 'yz': axis = [1., 0., 0.] else: raise ValueError("plane must be xy, xz or yz") elif 'axis' in dict_align.keys(): if dict_align['axis'] == 'x': axis = [1., 0., 0.] elif dict_align['axis'] == 'y': axis = [0., 1., 0.] elif dict_align['axis'] == 'z': axis = [0., 0., 1.] else: raise ValueError("axis must be x, y or z") else: raise ValueError('dict_align must contains plane or axis') return axis, angle
# add a rotated pdb structure to the database
def _add_aug_pdb(self, molgrp, pdbfile, name, axis, angle):
    """Add augmented pdbs to the dataset.

    The structure is loaded (aligned first when self.align is set),
    rotated around the given axis and stored in molgrp as fixed-width
    byte strings.

    Args:
        molgrp (str): name of the molgroup
        pdbfile (str): pdb file name
        name (str): name of the dataset
        axis (list(float)): axis of rotation
        angle (float): angle of rotation

    Returns:
        list(float): center of the molecule, i.e. the mean of the x, y, z
            coordinates read back after the rotation
    """
    # create the sqldb
    if self.align is None:
        sqldb = pdb2sql.pdb2sql(pdbfile)
    else:
        sqldb = self._get_aligned_sqldb(pdbfile, self.align)

    try:
        # rotate the positions
        pdb2sql.transform.rot_axis(sqldb, axis, angle)

        # get molecule center
        xyz = sqldb.get('x,y,z')
        center = np.mean(xyz, 0)

        # get the pdb-format data
        data = np.array(sqldb.sql2pdb()).astype('|S78')
        molgrp.create_dataset(name, data=data)
    finally:
        # close the db even when the rotation or the write fails
        sqldb._close()

    return center
# rotate the xyz-formatted feature in the database
[docs] @staticmethod def _rotate_feature(molgrp, axis, angle, center, feat_name='all'): """Rotate the raw feature values. Args: molgrp (str): name pf the molgrp axis (list(float)): axis of rotation angle (float): angle of rotation center (list(float)): center of rotation feat_name (str): name of the feature to rotate or 'all' """ if feat_name == 'all': feat = list(molgrp['features'].keys()) else: feat = feat_name if not isinstance(feat, list): feat = list(feat) for fn in feat: # extract the data data = molgrp['features/' + fn][()] # if data not empty if data.shape[0] != 0: # xyz xyz = data[:, 1:4] # get rotated xyz xyz_rot = pdb2sql.transform.rot_xyz_around_axis( xyz, axis, angle, center) # put back the data molgrp['features/' + fn][:, 1:4] = xyz_rot