Source code for imagine.tools.covariance_estimator

"""
This module contains estimation algorithms for the
covariance matrix based on a finite number of samples.

For the testing suits, please turn to "imagine/tests/tools_tests.py".
"""

# %% IMPORTS
# Built-in imports
import logging as log

# Package imports
import numpy as np

# IMAGINE imports
from imagine.tools.parallel_ops import (
    pmean, ptrans, pmult, peye, ptrace, pshape)
from imagine.tools.config import rc

# All declaration
__all__ = ['empirical_cov', 'empirical_mcov', 'oas_cov', 'oas_mcov',
           'diagonal_cov', 'diagonal_mcov']


# %% FUNCTION DEFINITIONS
[docs]def diagonal_cov(data): """ Assumes the covariance matrix is simply a diagonal matrix whose values correspond to the sample variances Parameters ---------- data : numpy.ndarray Ensemble of observables, in global shape (ensemble size, data size). Returns ------- cov : numpy.ndarray Covariance matrix """ # MPI version still to be implemented in the future if rc['distributed_arrays']: raise NotImplementedError return np.diag(data.var(axis=0))
[docs]def diagonal_mcov(data): """ Assumes the covariance matrix is simply a diagonal matrix whose values correspond to the sample variances Parameters ---------- data : numpy.ndarray Ensemble of observables, in global shape (ensemble size, data size). Returns ------- mean : numpy.ndarray Ensemble mean cov : numpy.ndarray Covariance matrix """ return pmean(data), diagonal_cov(data)
[docs]def empirical_cov(data): r""" Empirical covariance estimator Given some data matrix, :math:`D`, where rows are different samples and columns different properties, the covariance can be estimated from .. math:: U_{ij} = D_{ij} - \overline{D}_j\,,\; \text{with}\; \overline{D}_j=\tfrac{1}{N} \sum_{i=1}^N D_{ij} .. math:: \text{cov} = \tfrac{1}{N} U^T U Notes ----- While conceptually simple, this is usually not the best option. Parameters ---------- data : numpy.ndarray Ensemble of observables, in global shape (ensemble size, data size). Returns ------- cov : numpy.ndarray Distributed (not copied) covariance matrix in global shape (data size, data size), each node takes part of the rows. """ log.debug('@ covariance_estimator::empirical_cov') _, cov = empirical_mcov(data) return cov
[docs]def empirical_mcov(data): r""" Empirical covariance estimator Given some data matrix, :math:`D`, where rows are different samples and columns different properties, the covariance can be estimated from .. math:: U_{ij} = D_{ij} - \overline{D}_j\,,\; \text{with}\; \overline{D}_j=\tfrac{1}{N} \sum_{i=1}^N D_{ij} .. math:: \text{cov} = \tfrac{1}{N} U^T U Notes ----- While conceptually simple, this is usually not the best option. Parameters ---------- data : numpy.ndarray Ensemble of observables, in global shape (ensemble size, data size). Returns ------- mean : numpy.ndarray Copied ensemble mean (on all nodes). cov : numpy.ndarray Distributed (not copied) covariance matrix in global shape (data size, data size), each node takes part of the rows. """ log.debug('@ covariance_estimator::empirical_mcov') assert isinstance(data, np.ndarray) assert (len(data.shape) == 2) # Get ensemble size (i.e. the number of rows) ensemble_size, _ = pshape(data) # Calculates covariance mean = pmean(data) u = data - mean cov = pmult(ptrans(u), u) / ensemble_size return mean, cov
[docs]def oas_cov(data): r""" Estimate covariance with the Oracle Approximating Shrinkage algorithm. Given some :math:`n\times m` data matrix, :math:`D`, where rows are different samples and columns different properties, the covariance can be estimated in the following way. .. math:: U_{ij} = D_{ij} - \overline{D}_j\,,\; \text{with}\; \overline{D}_j=\tfrac{1}{n} \sum_{i=1}^n D_{ij} Let .. math:: S = \tfrac{1}{n} U^T U\,,\; T = \text{tr}(S)\quad\text{and}\quad V = \text{tr}(S^2) .. math:: \tilde\rho = \min\left[1,\frac{(1-2/m)V + T^2}{ (n+1-2/m)(V-T^2/m)}\right] The covariance is given by .. math:: \text{cov}_\text{OAS} = (1-\rho)S + \tfrac{1}{N} \rho T I_m Parameters ---------- data : numpy.ndarray Distributed data in global shape (ensemble_size, data_size). Returns ------- cov : numpy.ndarray Covariance matrix in global shape (data_size, data_size). """ log.debug('@ covariance_estimator::oas_cov') _, cov = oas_mcov(data) return cov
[docs]def oas_mcov(data): """ Estimate covariance with the Oracle Approximating Shrinkage algorithm. See `imagine.tools.covariance_estimator.oas_cov` for details. This function aditionally returns the computed ensemble mean. Parameters ---------- data : numpy.ndarray Distributed data in global shape (ensemble_size, data_size). Returns ------- mean : numpy.ndarray Copied ensemble mean (on all nodes). cov : numpy.ndarray Distributed covariance matrix in shape (data_size, data_size). """ log.debug('@ covariance_estimator::oas_mcov') assert isinstance(data, np.ndarray) assert (len(data.shape) == 2) # Finds ensemble size and data size data_size = data.shape[1] ensemble_size, _ = pshape(data) # Calculates OAS covariance extimator from empirical covariance estimator mean = pmean(data) u = data - mean s = pmult(ptrans(u), u) / ensemble_size trs = ptrace(s) trs2 = ptrace(pmult(s, s)) numerator = (1.0 - 2.0/data_size)*trs2 + trs*trs denominator = (ensemble_size +1.0-2.0/data_size)*(trs2 - (trs*trs)/data_size) if denominator == 0: rho = 1 else: rho = np.min([1, numerator/denominator]) cov = (1.-rho)*s+peye(data_size)*rho*trs/data_size return mean, cov