(DT) Segment 29: GMMs in N-Dimensions

From Computational Statistics Course Wiki
Jump to navigation Jump to search

Code for fitting using GMM

   function [mu, sigma, err, it, iterdata] = fitGaussMM(data,ncomp,mu,sigma,popfrac,maxit,tol)
   
   % dimension of problem
   dim = size(data,2);
   % number of data points
   nd = size(data,1);
   
   % initial starting guesses for mu and sigma for the ncomp Gaussians to
   % be used in the mixture model
   if isempty(mu)
       mu = data(randsample(nd,ncomp),:);
   end
   if isempty(sigma)
       for c = 1:ncomp
           sigma(:,:,c) = rand(dim,dim);
           sigma(:,:,c) = sigma(:,:,c)'*sigma(:,:,c);
       end
   end
   % initial population fraction guess
   if isempty(popfrac)
       popfrac = ones(1,ncomp)/ncomp;
   end
   
   % maximum number of iterations
   if isempty(maxit)
       maxit = 20;
   end
   
   % stopping criterion 
   if isempty(tol)
       tol = 1e-5;
   end
   
   % data points at which residual/error is calculated
   np = 100;
   xe = zeros(np,dim);
   for d = 1:dim
       xe(:,d) = linspace(min(data(:,d)),max(data(:,d)),np)';
   end
   
   % normalized probability array
   prns = zeros(nd,ncomp);
   prns_e = zeros(np,ncomp);
   % iteration counter
   it = 1;
   % norm(change from last time)
   err = 1;
   % function for calculating diagonal of data'*sigma*data
   diagm = @(data,mu,sigma) sum(((data-ones(size(data))*spdiags(mu(:),0,dim,dim))/(chol(sigma,'Lower'))).^2,2);
   % initial residual
   res = zeros(np,1);
   % iteration data
   iterdata = cell(maxit,3);
   
   while ((it <= maxit) && err > tol)
       %%% E-Step
       pr = @(data,k) popfrac(1,k)*exp(-0.5*diagm(data,mu(k,:),sigma(:,:,k)))./(sqrt(det(sigma(:,:,k)))*((2*pi)^(dim/2)));
       for k = 1:ncomp
           prns(:,k) = pr(data,k);
       end
       prns_n = spdiags(1./sum(prns,2),0,nd,nd)*prns;
       %%% M-Step
       for k = 1:ncomp
           mu(k,:) = sum(spdiags(prns_n(:,k),0,nd,nd)*data,1)/sum(prns_n(:,k));
           temp = data - ones(size(data))*spdiags(mu(k,:)',0,dim,dim);
           xmmu = (spdiags(prns_n(:,k),0,nd,nd)*temp).'*temp;
           sigma(:,:,k) = xmmu/sum(prns_n(:,k));
       end
       popfrac = sum(prns_n,1)/nd;
       
       % error calculation
       for k = 1:ncomp
           prns_e(:,k) = pr(xe,k);
       end
       thefunc = sum(prns_e*diag(popfrac),2);
       temp = thefunc-res;
       err = norm(temp);
       res = temp + res;
       
       % save all iteration data
       iterdata(it,:) = {it,mu,sigma};
       
       % update iteration counter
       it = it + 1;
   end
   
   it = it-1;
   iterdata = iterdata(1:it,:);
   
   
   end

Code for fitting using k-means Clustering

   function [mu, clusters, err, it, iterdata] = kMeanClustering(data,ncomp,mu,maxit,tol)
   
   % dimension of problem
   dim = size(data,2);
   % number of data points
   nd = size(data,1);
   
   % initial starting guesses for mu for the ncomp model
   if isempty(mu)
       mu = data(randsample(nd,ncomp),:);
   end
   
   % maximum number of iterations
   if isempty(maxit)
       maxit = 20;
   end
   
   % stopping criterion 
   if isempty(tol)
       tol = 1e-5;
   end
   
   % initial population fraction
   popfrac = rand(1,ncomp); popfrac = popfrac/sum(popfrac);
   % normalized probability array
   prns = zeros(nd,ncomp);
   % iteration counter
   it = 1;
   % norm(change from last time)
   err = 1;
   % matrix of all 1s
   onem = ones(nd,dim);
   % initial residual
   res = zeros(nd,1);
   % iteration data
   iterdata = cell(maxit,3);
   
   while ((it <= maxit) && err > tol)
       %%% E-Step
       pr = @(k) popfrac(1,k)*1./(sum((data - onem*spdiags(mu(k,:)',0,dim,dim)).^2,2)+eps);
       for k = 1:ncomp
           prns(:,k) = pr(k);
       end
       prns_n = spdiags(1./sum(prns,2),0,nd,nd)*prns;
       %%% M-Step
       for k = 1:ncomp
           mu(k,:) = sum(spdiags(prns_n(:,k),0,nd,nd)*data,1)/sum(prns_n(:,k));
       end
       popfrac = sum(prns_n,1)/nd;
       thefunc = sum(prns*diag(popfrac),2);
       temp = thefunc-res;
       err = norm(temp);
       res = temp + res;
       [~,clusters] = max(prns_n,[],2);
       
       % save all iteration data
       iterdata(it,:) = {it,mu,clusters};
       
       % update iteration counter
       it = it + 1;
   end
   
   it = it-1;
   % assignment of datapoints
   [~,clusters] = max(prns_n,[],2);
   iterdata = iterdata(1:it,:);
   
   end