Source code for deeprank.generate.NormalizeData
import os
import pickle
import h5py
import numpy as np
from deeprank.tools import sparse
[docs]class NormalizeData(object):
def __init__(self, fname, shape=None):
"""Compute the normalization factor for the features and targets of a
given HDF5 file.
The normalization of the features is done through the NormParam class that assumes gaussian distribution.
Hence the Normalized data should be normally distributed with a 0 mean value and 1 standard deviation.
The normalization of the targets is done vian a min/max normalization. As a result the normalized targets
should all lie between 0 and 1. By default the output file containing the normalization dictionary is called <hdf5name>_norm.pckl
Args:
fname (str): name of the hdf5 file
shape (tuple(int), optional): shape of the grid in the hdf5 file
Example:
>>> norm = NormalizeData('1ak4.hdf5')
>>> norm.get()
"""
self.fname = fname
self.parameters = {'features': {}, 'targets': {}}
self.shape = shape
self.fexport = os.path.splitext(self.fname)[0] + '_norm.pckl'
self.skip_feature = []
self.skip_target = []
[docs] def get(self):
"""Get the normalization and write them to file."""
self._extract_shape()
self._load()
self._extract_data()
self._process_data()
self._export_data()
[docs] def _load(self):
"""Load data from already existing normalization file."""
if os.path.isfile(self.fexport):
f = open(self.fexport, 'rb')
self.parameters = pickle.load(f)
f.close()
for _, feat_name in self.parameters['features'].items():
for name, _ in feat_name.items():
self.skip_feature.append(name)
for target in self.parameters['targets'].keys():
self.skip_target.append(target)
[docs] def _extract_shape(self):
"""Get the shape of the data in the hdf5 file."""
if self.shape is not None:
return
f5 = h5py.File(self.fname, 'r')
mol = list(f5.keys())[0]
mol_data = f5.get(mol)
if 'grid_points' in mol_data:
nx = mol_data['grid_points']['x'].shape[0]
ny = mol_data['grid_points']['y'].shape[0]
nz = mol_data['grid_points']['z'].shape[0]
self.shape = (nx, ny, nz)
else:
raise ValueError(
'Impossible to determine sparse grid shape.\\n Specify argument grid_shape=(x,y,z)')
[docs] def _extract_data(self):
"""Extract the data from the different maps."""
f5 = h5py.File(self.fname, 'r')
mol_names = list(f5.keys())
self.nmol = len(mol_names)
# loop over the molecules
for mol in mol_names:
# get the mapped features group
data_group = f5.get(mol + '/mapped_features/')
# loop over all the feature types
for feat_types, feat_names in data_group.items():
# if feature type not in param add
if feat_types not in self.parameters['features']:
self.parameters['features'][feat_types] = {}
# loop over all the feature
for name in feat_names:
# we skip the target
if name in self.skip_feature:
continue
# create the param if it doesn't already exists
if name not in self.parameters['features'][feat_types]:
self.parameters['features'][feat_types][name] = NormParam(
)
# load the matrix
feat_data = data_group[feat_types + '/' + name]
if feat_data.attrs['sparse']:
mat = sparse.FLANgrid(sparse=True,
index=feat_data['index'][:],
value=feat_data['value'][:],
shape=self.shape).to_dense()
else:
mat = feat_data['value'][:]
# add the parameter (mean and var)
self.parameters['features'][feat_types][name].add(
np.mean(mat), np.var(mat))
# get the target groups
target_group = f5.get(mol + '/targets')
# loop over all the targets
for tname, tval in target_group.items():
# we skip the already computed target
if tname in self.skip_target:
continue
# create a new item if needed
if tname not in self.parameters['targets']:
self.parameters['targets'][tname] = MinMaxParam()
# update the value
self.parameters['targets'][tname].update(tval[()])
f5.close()
[docs] def _process_data(self):
"""Compute the standard deviation of the data."""
for feat_types, feat_dict in self.parameters['features'].items():
for feat in feat_dict:
self.parameters['features'][feat_types][feat].process(
self.nmol)
[docs] def _export_data(self):
"""Pickle the data to file."""
f = open(self.fexport, 'wb')
pickle.dump(self.parameters, f)
f.close()
[docs]class NormParam(object):
def __init__(self, std=0, mean=0, var=0, sqmean=0):
"""Compute gaussian normalization for a given feature.
This class allows to extract the standard deviation, mean value, variance and square root of the
mean value of a mapped feature stored in the hdf5 file. As the entire data set is too large to fit in memory,
the standard deviation of a given feature is calculated from the std of all the individual grids. This is done following:
https://stats.stackexchange.com/questions/25848/how-to-sum-a-standard-deviation:
.. math::
\\sigma_{tot}=\\sqrt{\\frac{1}{N}\\sum_i \\sigma_i^2+\\frac{1}{N}\\sum_i\\mu_i^2-(\\frac{1}{N}\\sum_i\\mu_i)^2}
Args:
std (float, optional): standard deviation
mean (float,optional): mean value
var (float,optional): variance
sqmean (float, optional): square roo of the variance
"""
self.std = std
self.mean = mean
self.var = var
self.sqmean = sqmean
[docs] def add(self, mean, var):
"""Add the mean value, sqmean and variance of a new molecule to the
corresponding attributes."""
self.mean += mean
self.sqmean += mean**2
self.var += var
[docs] def process(self, n):
"""Compute the standard deviation of the ensemble."""
# normalize the mean and var
self.mean /= n
self.var /= n
self.sqmean /= n
# get the std
self.std = self.var
self.std += self.sqmean
self.std -= self.mean**2
self.std = np.sqrt(self.std)
[docs]class MinMaxParam(object):
"""Compute the min/max of an ensenble of data.
This is principally used to normalized the target values
Args:
minv (float, optional): minimal value
maxv (float, optional): maximal value
"""
def __init__(self, minv=None, maxv=None):
self.min = minv
self.max = maxv
[docs] def update(self, val):
if self.min is None:
self.min = val
self.max = val
else:
self.min = min(self.min, val)
self.max = max(self.max, val)