Source code for tesfdmtools.methods.PCA

#!/usr/bin/env python3
#
'''
.. module:: PCA
   :synopsis: Principal Component Analysis (PCA) 
.. moduleauthor:: Cor de Vries <c.p.de.vries@sron.nl>
'''

import sys,os
import numpy

import matplotlib
matplotlib.use('GTK3Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

from numpy import linalg as LA

def cov(m, y=None, rowvar=1, bias=0, ddof=None):
    """
    Estimate a covariance matrix, given data.

    Covariance indicates the level to which two variables vary together.
    If we examine N-dimensional samples, :math:`X = [x_1, x_2, ... x_N]^T`,
    then the covariance matrix element :math:`C_{ij}` is the covariance of
    :math:`x_i` and :math:`x_j`. The element :math:`C_{ii}` is the variance
    of :math:`x_i`.

    Args:
       * `m` = array_like
         A 1-D or 2-D array containing multiple variables and observations.
         Each row of `m` represents a variable, and each column a single
         observation of all those variables. Also see `rowvar` below.
       * `y` = array_like, optional
         An additional set of variables and observations. `y` has the same
         form as that of `m`.
       * `rowvar` = int, optional
         If `rowvar` is non-zero (default), then each row represents a
         variable, with observations in the columns. Otherwise, the
         relationship is transposed: each column represents a variable,
         while the rows contain observations.
       * `bias` = int, optional
         Default normalization is by ``(N - 1)``, where ``N`` is the number
         of observations given (unbiased estimate). If `bias` is 1, then
         normalization is by ``N``. These values can be overridden by using
         the keyword ``ddof`` in numpy versions >= 1.5.
       * `ddof` = int, optional
         .. versionadded:: 1.5
         If not ``None`` normalization is by ``(N - ddof)``, where ``N`` is
         the number of observations; this overrides the value implied by
         ``bias``. The default value is ``None``.

    Returns:
       * `out` = ndarray
         The covariance matrix of the variables.

    See Also
    --------
    corrcoef : Normalized covariance matrix

    Examples
    --------
    Consider two variables, :math:`x_0` and :math:`x_1`, which
    correlate perfectly, but in opposite directions:

    >>> x = np.array([[0, 2], [1, 1], [2, 0]]).T
    >>> x
    array([[0, 1, 2],
           [2, 1, 0]])

    Note how :math:`x_0` increases while :math:`x_1` decreases. The covariance
    matrix shows this clearly:

    >>> np.cov(x)
    array([[ 1., -1.],
           [-1.,  1.]])

    Note that element :math:`C_{0,1}`, which shows the correlation between
    :math:`x_0` and :math:`x_1`, is negative.

    Further, note how `x` and `y` are combined:

    >>> x = [-2.1, -1,  4.3]
    >>> y = [3,  1.1,  0.12]
    >>> X = np.vstack((x, y))
    >>> print(np.cov(X))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(np.cov(x, y))
    [[ 11.71        -4.286     ]
     [ -4.286        2.14413333]]
    >>> print(np.cov(x))
    11.71

    """
    # Check inputs
    if ddof is not None and ddof != int(ddof):
        raise ValueError("ddof must be integer")

    X = numpy.array(m, ndmin=2)
    if X.size == 0:
        # handle empty arrays
        return numpy.array(m)
    if X.shape[0] == 1:
        rowvar = 1
    if rowvar:
        axis = 0
        tup = (slice(None), numpy.newaxis)
    else:
        axis = 1
        tup = (numpy.newaxis, slice(None))

    if y is not None:
        y = numpy.array(y, copy=False, ndmin=2)
        X = numpy.concatenate((X, y), axis)

    X -= X.mean(axis=1-axis)[tup]
    if rowvar:
        N = X.shape[1]
    else:
        N = X.shape[0]

    if ddof is None:
        if bias == 0:
            ddof = 1
        else:
            ddof = 0
    fact = float(N - ddof)

    if not rowvar:
        return (numpy.dot(X.T, X.conj()) / fact).squeeze()
    else:
        return (numpy.dot(X, X.T.conj()) / fact).squeeze()
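
# Illustrative check (not part of the original module; the helper name
# '_cov_example' and its data are made up): with rowvar=0 the columns of the
# input are treated as variables, which is the layout used by
# PCA.fit_transform() below when it calls cov(data, rowvar=False).
def _cov_example():
    d = numpy.array([[-2.1, 3.00],
                     [-1.0, 1.10],
                     [ 4.3, 0.12]])       # 3 observations (rows) of 2 variables (columns)
    c = cov(d, rowvar=0)                  # equivalent to numpy.cov(d.T)
    print(c)                              # ~[[11.71, -4.286], [-4.286, 2.144]]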
class PCA:
    '''
    Class for PCA analysis
    '''

    def __init__(self, n_components=10):
        self.components = n_components
    def fit_transform(self, data):
        '''
        Enter data into the PCA analysis.

        Args:
           * `data` = matrix with data: data(jrow,iax)

              * `jrow` = data record number
              * `iax`  = data record axis

        Returns:
           * `data` = transformed data in eigenvector space

        Properties:
           * `components_`               = eigenvectors
           * `explained_variance_ratio_` = relative weights of eigenvectors
        '''
        m, n = data.shape
        data -= data.mean(axis=0)             # mean center the data
        R = cov(data, rowvar=False)           # calculate the covariance matrix
        # calculate eigenvectors & eigenvalues of the covariance matrix;
        # use 'eigh' rather than 'eig' since R is symmetric,
        # the performance gain is substantial
        evals, evecs = LA.eigh(R)
        idx = numpy.argsort(evals)[::-1]      # sort eigenvalues in decreasing order
        evecs = evecs[:, idx]                 # sort eigenvectors according to same index
        evals = evals[idx]
        # select the first n eigenvectors (n is the desired dimension
        # of the rescaled data array, or n_components)
        self.components_ = (evecs[:, :self.components]).T
        # store the relative weights of the selected eigenvalues and
        # carry out the transformation on the data using the eigenvectors,
        # returning the re-scaled data
        self.explained_variance_ratio_ = evals[:self.components]
        self.explained_variance_ratio_ = self.explained_variance_ratio_ / self.explained_variance_ratio_.sum()
        return numpy.dot(self.components_, data.T).T
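
# Minimal usage sketch (not part of the original module): the matrix 'records'
# and the result name 'scores' below are made-up illustrations of the
# data(jrow,iax) layout expected by fit_transform().
if __name__ == '__main__':
    numpy.random.seed(1)
    base = numpy.random.normal(size=(500, 1))             # common signal shared by two axes
    noise = numpy.random.normal(size=(500, 1))            # independent third axis
    records = numpy.hstack((base, 2.0 * base, noise))     # 500 records, 3 axes
    pca = PCA(n_components=2)
    scores = pca.fit_transform(records)                   # records projected onto eigenvectors
    print(scores.shape)                                   # -> (500, 2)
    print(pca.explained_variance_ratio_)                  # relative weight of each component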