"""
This module contains estimation algorithms for the
covariance matrix based on a finite number of samples.
For the testing suits, please turn to "imagine/tests/tools_tests.py".
"""
# %% IMPORTS
# Built-in imports
import logging as log
# Package imports
import numpy as np
# IMAGINE imports
from imagine.tools.parallel_ops import (
pmean, ptrans, pmult, peye, ptrace, pshape)
from imagine.tools.config import rc
# All declaration
__all__ = ['empirical_cov', 'empirical_mcov', 'oas_cov', 'oas_mcov',
'diagonal_cov', 'diagonal_mcov']
# %% FUNCTION DEFINITIONS
[docs]def diagonal_cov(data):
"""
Assumes the covariance matrix is simply a diagonal matrix whose values
correspond to the sample variances
Parameters
----------
data : numpy.ndarray
Ensemble of observables, in global shape (ensemble size, data size).
Returns
-------
cov : numpy.ndarray
Covariance matrix
"""
# MPI version still to be implemented in the future
if rc['distributed_arrays']:
raise NotImplementedError
return np.diag(data.var(axis=0))
[docs]def diagonal_mcov(data):
"""
Assumes the covariance matrix is simply a diagonal matrix whose values
correspond to the sample variances
Parameters
----------
data : numpy.ndarray
Ensemble of observables, in global shape (ensemble size, data size).
Returns
-------
mean : numpy.ndarray
Ensemble mean
cov : numpy.ndarray
Covariance matrix
"""
return pmean(data), diagonal_cov(data)
[docs]def empirical_cov(data):
r"""
Empirical covariance estimator
Given some data matrix, :math:`D`, where rows are different samples
and columns different properties, the covariance can be
estimated from
.. math::
U_{ij} = D_{ij} - \overline{D}_j\,,\;
\text{with}\; \overline{D}_j=\tfrac{1}{N} \sum_{i=1}^N D_{ij}
.. math::
\text{cov} = \tfrac{1}{N} U^T U
Notes
-----
While conceptually simple, this is usually not the
best option.
Parameters
----------
data : numpy.ndarray
Ensemble of observables, in global shape (ensemble size, data size).
Returns
-------
cov : numpy.ndarray
Distributed (not copied) covariance matrix in global shape (data size, data size),
each node takes part of the rows.
"""
log.debug('@ covariance_estimator::empirical_cov')
_, cov = empirical_mcov(data)
return cov
[docs]def empirical_mcov(data):
r"""
Empirical covariance estimator
Given some data matrix, :math:`D`, where rows are different samples
and columns different properties, the covariance can be
estimated from
.. math::
U_{ij} = D_{ij} - \overline{D}_j\,,\;
\text{with}\; \overline{D}_j=\tfrac{1}{N} \sum_{i=1}^N D_{ij}
.. math::
\text{cov} = \tfrac{1}{N} U^T U
Notes
-----
While conceptually simple, this is usually not the
best option.
Parameters
----------
data : numpy.ndarray
Ensemble of observables, in global shape (ensemble size, data size).
Returns
-------
mean : numpy.ndarray
Copied ensemble mean (on all nodes).
cov : numpy.ndarray
Distributed (not copied) covariance matrix in global shape (data size, data size),
each node takes part of the rows.
"""
log.debug('@ covariance_estimator::empirical_mcov')
assert isinstance(data, np.ndarray)
assert (len(data.shape) == 2)
# Get ensemble size (i.e. the number of rows)
ensemble_size, _ = pshape(data)
# Calculates covariance
mean = pmean(data)
u = data - mean
cov = pmult(ptrans(u), u) / ensemble_size
return mean, cov
[docs]def oas_cov(data):
r"""
Estimate covariance with the Oracle Approximating Shrinkage algorithm.
Given some :math:`n\times m` data matrix, :math:`D`,
where rows are different samples and columns different properties,
the covariance can be estimated in the following way.
.. math::
U_{ij} = D_{ij} - \overline{D}_j\,,\;
\text{with}\; \overline{D}_j=\tfrac{1}{n} \sum_{i=1}^n D_{ij}
Let
.. math::
S = \tfrac{1}{n} U^T U\,,\;
T = \text{tr}(S)\quad\text{and}\quad V = \text{tr}(S^2)
.. math::
\tilde\rho = \min\left[1,\frac{(1-2/m)V + T^2}{ (n+1-2/m)(V-T^2/m)}\right]
The covariance is given by
.. math::
\text{cov}_\text{OAS} = (1-\rho)S + \tfrac{1}{N} \rho T I_m
Parameters
----------
data : numpy.ndarray
Distributed data in global shape (ensemble_size, data_size).
Returns
-------
cov : numpy.ndarray
Covariance matrix in global shape (data_size, data_size).
"""
log.debug('@ covariance_estimator::oas_cov')
_, cov = oas_mcov(data)
return cov
[docs]def oas_mcov(data):
"""
Estimate covariance with the Oracle Approximating Shrinkage algorithm.
See `imagine.tools.covariance_estimator.oas_cov` for details. This
function aditionally returns the computed ensemble mean.
Parameters
----------
data : numpy.ndarray
Distributed data in global shape (ensemble_size, data_size).
Returns
-------
mean : numpy.ndarray
Copied ensemble mean (on all nodes).
cov : numpy.ndarray
Distributed covariance matrix in shape (data_size, data_size).
"""
log.debug('@ covariance_estimator::oas_mcov')
assert isinstance(data, np.ndarray)
assert (len(data.shape) == 2)
# Finds ensemble size and data size
data_size = data.shape[1]
ensemble_size, _ = pshape(data)
# Calculates OAS covariance extimator from empirical covariance estimator
mean = pmean(data)
u = data - mean
s = pmult(ptrans(u), u) / ensemble_size
trs = ptrace(s)
trs2 = ptrace(pmult(s, s))
numerator = (1.0 - 2.0/data_size)*trs2 + trs*trs
denominator = (ensemble_size +1.0-2.0/data_size)*(trs2 - (trs*trs)/data_size)
if denominator == 0:
rho = 1
else:
rho = np.min([1, numerator/denominator])
cov = (1.-rho)*s+peye(data_size)*rho*trs/data_size
return mean, cov