Source code for deeprank.generate.NormalizeData

import os
import pickle

import h5py
import numpy as np

from deeprank.tools import sparse


class NormalizeData(object):

    def __init__(self, fname, shape=None):
        """Compute the normalization factors for the features and targets
        of a given HDF5 file.

        The normalization of the features is done through the NormParam
        class, which assumes a gaussian distribution. Hence the normalized
        data should be normally distributed with a mean value of 0 and a
        standard deviation of 1. The normalization of the targets is done
        via a min/max normalization. As a result the normalized targets
        should all lie between 0 and 1. By default the output file
        containing the normalization dictionary is called
        <hdf5name>_norm.pckl

        Args:
            fname (str): name of the hdf5 file
            shape (tuple(int), optional): shape of the grid in the hdf5 file

        Example:
            >>> norm = NormalizeData('1ak4.hdf5')
            >>> norm.get()
        """
        self.fname = fname
        self.parameters = {'features': {}, 'targets': {}}
        self.shape = shape
        self.fexport = os.path.splitext(self.fname)[0] + '_norm.pckl'
        self.skip_feature = []
        self.skip_target = []
    def get(self):
        """Compute the normalization parameters and write them to file."""
        self._extract_shape()
        self._load()
        self._extract_data()
        self._process_data()
        self._export_data()
    def _load(self):
        """Load data from an already existing normalization file."""
        if os.path.isfile(self.fexport):
            with open(self.fexport, 'rb') as f:
                self.parameters = pickle.load(f)

            # features/targets already in the file can be skipped later
            for _, feat_dict in self.parameters['features'].items():
                for name, _ in feat_dict.items():
                    self.skip_feature.append(name)

            for target in self.parameters['targets'].keys():
                self.skip_target.append(target)
    def _extract_shape(self):
        """Get the shape of the data in the hdf5 file."""
        if self.shape is not None:
            return

        f5 = h5py.File(self.fname, 'r')
        mol = list(f5.keys())[0]
        mol_data = f5.get(mol)

        if 'grid_points' in mol_data:
            nx = mol_data['grid_points']['x'].shape[0]
            ny = mol_data['grid_points']['y'].shape[0]
            nz = mol_data['grid_points']['z'].shape[0]
            f5.close()
            self.shape = (nx, ny, nz)
        else:
            f5.close()
            raise ValueError(
                'Impossible to determine sparse grid shape.\n'
                'Specify argument shape=(x,y,z)')
    def _extract_data(self):
        """Extract the data from the different maps."""
        f5 = h5py.File(self.fname, 'r')
        mol_names = list(f5.keys())
        self.nmol = len(mol_names)

        # loop over the molecules
        for mol in mol_names:

            # get the mapped features group
            data_group = f5.get(mol + '/mapped_features/')

            # loop over all the feature types
            for feat_types, feat_names in data_group.items():

                # if the feature type is not yet in the parameters, add it
                if feat_types not in self.parameters['features']:
                    self.parameters['features'][feat_types] = {}

                # loop over all the features
                for name in feat_names:

                    # skip features already present in the norm file
                    if name in self.skip_feature:
                        continue

                    # create the param if it doesn't already exist
                    if name not in self.parameters['features'][feat_types]:
                        self.parameters['features'][feat_types][name] = NormParam()

                    # load the matrix
                    feat_data = data_group[feat_types + '/' + name]
                    if feat_data.attrs['sparse']:
                        mat = sparse.FLANgrid(sparse=True,
                                              index=feat_data['index'][:],
                                              value=feat_data['value'][:],
                                              shape=self.shape).to_dense()
                    else:
                        mat = feat_data['value'][:]

                    # add the parameters (mean and var)
                    self.parameters['features'][feat_types][name].add(
                        np.mean(mat), np.var(mat))

            # get the target group
            target_group = f5.get(mol + '/targets')

            # loop over all the targets
            for tname, tval in target_group.items():

                # skip targets already present in the norm file
                if tname in self.skip_target:
                    continue

                # create a new item if needed
                if tname not in self.parameters['targets']:
                    self.parameters['targets'][tname] = MinMaxParam()

                # update the value
                self.parameters['targets'][tname].update(tval[()])

        f5.close()
    def _process_data(self):
        """Compute the standard deviation of the data."""
        for feat_types, feat_dict in self.parameters['features'].items():
            for feat in feat_dict:
                self.parameters['features'][feat_types][feat].process(self.nmol)
    def _export_data(self):
        """Pickle the data to file."""
        with open(self.fexport, 'wb') as f:
            pickle.dump(self.parameters, f)
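End to end, the class above can be exercised on a toy file. The sketch below is illustrative only: it assumes the HDF5 layout that _extract_data reads (a mapped_features/<type>/<name> group carrying a 'sparse' attribute and a 'value' dataset, grid_points/x|y|z datasets, and scalar targets); every file, molecule, feature and target name in it is a made-up placeholder, not part of deeprank.

import pickle

import h5py
import numpy as np

# build a toy hdf5 file with two "molecules", one dense feature map
# and one scalar target; all names are placeholders
with h5py.File('toy.hdf5', 'w') as f5:
    for mol in ['mol1', 'mol2']:
        for axis in 'xyz':
            f5.create_dataset(mol + '/grid_points/' + axis,
                              data=np.linspace(0, 1, 10))
        feat = f5.create_group(mol + '/mapped_features/FeatType/feat1')
        feat.attrs['sparse'] = False
        feat.create_dataset('value', data=np.random.rand(10, 10, 10))
        f5[mol + '/targets/score'] = np.random.rand()

# compute and pickle the normalization parameters
NormalizeData('toy.hdf5').get()

# read them back and normalize new data
with open('toy_norm.pckl', 'rb') as f:
    param = pickle.load(f)

fp = param['features']['FeatType']['feat1']
norm_map = (np.random.rand(10, 10, 10) - fp.mean) / fp.std   # gaussian

tp = param['targets']['score']
norm_score = (0.5 - tp.min) / (tp.max - tp.min)              # min/max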
class NormParam(object):

    def __init__(self, std=0, mean=0, var=0, sqmean=0):
        """Compute the gaussian normalization parameters for a given feature.

        This class extracts the standard deviation, mean value, variance
        and squared mean value of a mapped feature stored in the hdf5 file.

        As the entire data set is too large to fit in memory, the standard
        deviation of a given feature is calculated from the std of all the
        individual grids. This is done following
        https://stats.stackexchange.com/questions/25848/how-to-sum-a-standard-deviation:

        .. math::
            \\sigma_{tot}=\\sqrt{\\frac{1}{N}\\sum_i \\sigma_i^2+\\frac{1}{N}\\sum_i\\mu_i^2-(\\frac{1}{N}\\sum_i\\mu_i)^2}

        Args:
            std (float, optional): standard deviation
            mean (float, optional): mean value
            var (float, optional): variance
            sqmean (float, optional): squared mean value
        """
        self.std = std
        self.mean = mean
        self.var = var
        self.sqmean = sqmean
    def add(self, mean, var):
        """Add the mean value, squared mean and variance of a new molecule
        to the corresponding attributes."""
        self.mean += mean
        self.sqmean += mean**2
        self.var += var
    def process(self, n):
        """Compute the standard deviation of the ensemble."""
        # normalize the mean, squared mean and var
        self.mean /= n
        self.var /= n
        self.sqmean /= n

        # std^2 = 1/N sum var_i + 1/N sum mu_i^2 - (1/N sum mu_i)^2
        self.std = self.var
        self.std += self.sqmean
        self.std -= self.mean**2
        self.std = np.sqrt(self.std)
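As a sanity check of the aggregation formula above: for equal-size grids, the std computed by add() and process() reproduces the std over all values pooled together. A minimal sketch, using random grids as stand-ins for mapped feature maps:

grids = [np.random.rand(8, 8, 8) for _ in range(5)]

p = NormParam()
for g in grids:
    p.add(np.mean(g), np.var(g))
p.process(len(grids))

# identical (up to floating point) to the std of the pooled values
pooled = np.concatenate([g.ravel() for g in grids])
assert np.isclose(p.std, pooled.std())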
class MinMaxParam(object):
    """Compute the min/max of an ensemble of data.

    This is principally used to normalize the target values.

    Args:
        minv (float, optional): minimal value
        maxv (float, optional): maximal value
    """

    def __init__(self, minv=None, maxv=None):
        self.min = minv
        self.max = maxv
    def update(self, val):
        """Update the running min and max with a new value."""
        if self.min is None:
            self.min = val
            self.max = val
        else:
            self.min = min(self.min, val)
            self.max = max(self.max, val)
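A short usage sketch of MinMaxParam and the min/max normalization it supports; the values are arbitrary:

p = MinMaxParam()
for val in [2.5, 0.7, 1.8]:
    p.update(val)

# after the updates, p.min == 0.7 and p.max == 2.5;
# a target x is then rescaled to [0, 1] via (x - min) / (max - min)
x_norm = (1.8 - p.min) / (p.max - p.min)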