Source code for tesfdmtools.methods.PCA

#!/usr/bin/env python3
#
'''
.. module:: PCA
   :synopsis: Principal Component Analysis (PCA) 
.. moduleauthor:: Cor de Vries <c.p.de.vries@sron.nl>
'''

import sys,os
import numpy

import matplotlib
matplotlib.use('GTK3Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from numpy import linalg as LA

def cov(m, y=None, rowvar=1, bias=0, ddof=None):
    """
    Estimate a covariance matrix, given data.

    Covariance indicates the level to which two variables vary together.
    If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
    then the covariance matrix element :math:`C_{ij}` is the covariance of
    :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
    of :math:`x_i`.

    Args:
       * `m` = array_like
         A 1-D or 2-D array containing multiple variables and observations.
         Each row of `m` represents a variable, and each column a single
         observation of all those variables. Also see `rowvar` below.
       * `y` = array_like, optional
         An additional set of variables and observations. `y` has the same
         form as that of `m`.
       * `rowvar` = int, optional
         If `rowvar` is non-zero (default), then each row represents a
         variable, with observations in the columns. Otherwise, the
         relationship is transposed: each column represents a variable,
         while the rows contain observations.
       * `bias` = int, optional
         Default normalization is by ``(N - 1)``, where ``N`` is the number
         of observations given (unbiased estimate). If `bias` is 1, then
         normalization is by ``N``. These values can be overridden by using
         the keyword ``ddof`` in numpy versions >= 1.5.
       * `ddof` = int, optional
         .. versionadded:: 1.5
         If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is
         the number of observations; this overrides the value implied by
         ``bias``. The default value is ``None``.

    Returns:
       * `out` = ndarray
         The covariance matrix of the variables.

    See Also
    --------
    corrcoef : Normalized covariance matrix

    Examples
    --------
    Consider two variables, :math:`x_0` and :math:`x_1`, which
    correlate perfectly, but in opposite directions:

    >>> x = np.array([[0, 2], [1, 1], [2, 0]]).T
    >>> x
    array([[0, 1, 2],
           [2, 1, 0]])

    Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance
    matrix shows this clearly:

    >>> np.cov(x)
    array([[ 1., -1.],
           [-1.,  1.]])

    Note that element :math:`C_{0,1}`, which shows the correlation between
    :math:`x_0` and :math:`x_1`, is negative.

    Further, note how `x` and `y` are combined:

    >>> x = [-2.1, -1,  4.3]
    >>> y = [3,  1.1,  0.12]
    >>> X = np.vstack((x, y))
    >>> print(np.cov(X))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(np.cov(x, y))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(np.cov(x))
    11.71

    """
    # Check inputs
    if ddof is not None and ddof != int(ddof):
        raise ValueError("ddof must be integer")

    X = numpy.array(m, ndmin=2)
    if X.size == 0:
        # handle empty arrays
        return numpy.array(m)
    if X.shape[0] == 1:
        rowvar = 1
    if rowvar:
        axis = 0
        tup = (slice(None), numpy.newaxis)
    else:
        axis = 1
        tup = (numpy.newaxis, slice(None))

    if y is not None:
        y = numpy.array(y, copy=False, ndmin=2)
        X = numpy.concatenate((X, y), axis)

    X -= X.mean(axis=1-axis)[tup]
    if rowvar:
        N = X.shape[1]
    else:
        N = X.shape[0]

    if ddof is None:
        if bias == 0:
            ddof = 1
        else:
            ddof = 0
    fact = float(N - ddof)

    if not rowvar:
        return (numpy.dot(X.T, X.conj()) / fact).squeeze()
    else:
        return (numpy.dot(X, X.T.conj()) / fact).squeeze()
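
# Illustrative check (not part of the original module; the helper name
# '_cov_example' and its data are made up): with rowvar=0 the columns of the
# input are treated as variables, which is the layout used by
# PCA.fit_transform() below when it calls cov(data, rowvar=False).
def _cov_example():
    d = numpy.array([[-2.1, 3.00],
                     [-1.0, 1.10],
                     [ 4.3, 0.12]])       # 3 observations (rows) of 2 variables (columns)
    c = cov(d, rowvar=0)                  # equivalent to numpy.cov(d.T)
    print(c)                              # ~[[11.71, -4.286], [-4.286, 2.144]]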
class PCA:
    '''
    Class for PCA analysis
    '''

    def __init__(self, n_components=10):
        self.components = n_components
    def fit_transform(self, data):
        '''
        Enter data into the PCA analysis.

        Args:
           * `data` = matrix with data: data(jrow,iax)

              * `jrow` = data record number
              * `iax`  = data record axis

        Returns:
           * `data` = transformed data in eigenvector space

        Properties:
           * `components_`               = eigenvectors
           * `explained_variance_ratio_` = relative weights of eigenvectors
        '''
        m, n = data.shape
        data -= data.mean(axis=0)             # mean center the data
        R = cov(data, rowvar=False)           # calculate the covariance matrix
        # calculate eigenvectors & eigenvalues of the covariance matrix;
        # use 'eigh' rather than 'eig' since R is symmetric,
        # the performance gain is substantial
        evals, evecs = LA.eigh(R)
        idx = numpy.argsort(evals)[::-1]      # sort eigenvalues in decreasing order
        evecs = evecs[:, idx]                 # sort eigenvectors according to same index
        evals = evals[idx]
        # select the first n eigenvectors (n is the desired dimension
        # of the rescaled data array, or n_components)
        self.components_ = (evecs[:, :self.components]).T
        # store the relative weights of the selected eigenvalues and
        # carry out the transformation on the data using the eigenvectors,
        # returning the re-scaled data
        self.explained_variance_ratio_ = evals[:self.components]
        self.explained_variance_ratio_ = self.explained_variance_ratio_ / self.explained_variance_ratio_.sum()
        return numpy.dot(self.components_, data.T).T
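
# Minimal usage sketch (not part of the original module): the matrix 'records'
# and the result name 'scores' below are made-up illustrations of the
# data(jrow,iax) layout expected by fit_transform().
if __name__ == '__main__':
    numpy.random.seed(1)
    base = numpy.random.normal(size=(500, 1))             # common signal shared by two axes
    noise = numpy.random.normal(size=(500, 1))            # independent third axis
    records = numpy.hstack((base, 2.0 * base, noise))     # 500 records, 3 axes
    pca = PCA(n_components=2)
    scores = pca.fit_transform(records)                   # records projected onto eigenvectors
    print(scores.shape)                                   # -> (500, 2)
    print(pca.explained_variance_ratio_)                  # relative weight of each component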