Team3-021714

From Computational Statistics Course Wiki
Jump to navigation Jump to search
A=fileread('SacCerChr4.txt');
As=sum(A=='A')
Cs=sum(A=='C')
Gs=sum(A=='G')
Ts=sum(A=='T')

N=length(A);
tot=As+Cs+Gs+Ts;
Ap=As/tot;
Cp=Cs/tot;
Gp=Gs/tot;
Tp=Ts/tot;

% TAG, TAA, TGA

probTAG=Tp*Ap*Gp;
probTAA=Tp*Ap*Ap;
probTGA=Tp*Gp*Ap;
totprob=probTAG+probTAA+probTGA;
k=0;
i=0;
n=0;
startstop=[];
pgntot=[];
while k<1
    i=i+1;
    x=3*(i-1)+1:3*i;
    if (sum(A(x)=='TAG')==3||sum(A(x)=='TAA')==3||sum(A(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;1 0 3*(i-n-1)+1 3*i];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i+length(A)<=2
        k=1;
    end
end   

k=0;
i=0;
n=0;
while k<1
    i=i+1;
    x=3*(i-1)+2:3*i+1;
    if (sum(A(x)=='TAG')==3||sum(A(x)=='TAA')==3||sum(A(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;1 1 3*(i-n-1)+2 3*i+1];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i-1+length(A)<=2
        k=1;
    end
end   

k=0;
i=0;
n=0;
while k<1
    i=i+1;
    x=3*(i-1)+3:3*i+2;
    if (sum(A(x)=='TAG')==3||sum(A(x)=='TAA')==3||sum(A(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;1 2 3*(i-n-1)+3 3*i+2];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i-2+length(A)<=2
        k=1;
    end
end   

B=char(zeros(size(A)));
for j=1:length(A)
    C=A(end-i+1);
    if C=='A'
        B(i)='T';
    elseif C=='T'
        B(i)='A';
    elseif C=='G'
        B(i)='C';
    else 
        B(i)='G';
    end
end

k=0;
i=0;
n=0;
while k<1
    i=i+1;
    x=3*(i-1)+1:3*i;
    if (sum(B(x)=='TAG')==3||sum(B(x)=='TAA')==3||sum(B(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;-1 0 3*(i-n-1)+1 3*i];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i+length(A)<=2
        k=1;
    end
end   

k=0;
i=0;
n=0;
while k<1
    i=i+1;
    x=3*(i-1)+2:3*i+1;
    if (sum(B(x)=='TAG')==3||sum(B(x)=='TAA')==3||sum(B(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;-1 1 3*(i-n-1)+2 3*i+1];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i-1+length(A)<=2
        k=1;
    end
end   

k=0;
i=0;
n=0;
while k<1
    i=i+1;
    x=3*(i-1)+3:3*i+2;
    if (sum(B(x)=='TAG')==3||sum(B(x)=='TAA')==3||sum(B(x)=='TGA')==3)
        %k=1;
        pgn=(1-totprob)^n;
        if pgn<.05
            startstop=[startstop;-1 2 3*(i-n-1)+3 3*i+2];
        end
        n=0;
        pgntot=[pgntot;pgn];
    else
        n=n+1;
    end
    if -3*i-2+length(A)<=2
        k=1;
    end
end  

startstop
hist(pgntot,1000);

Seg13ClassGroup3.jpg

A somewhat more refined version of the code:

% significance level
alpha = 0.05;

% load the file
fid = fopen('SacCerChr4.txt');
data = fscanf(fid,'%c');
fclose(fid);

% determine the other side of the strand in reverse order
data = [data' zeros(size(data))'];
for(i = 1:length(data))
    c = data(end-i+1,1);
    if(c == 'A')
        data(i,2) = 'T';
    elseif(c == 'T')
        data(i,2) = 'A';
    elseif(c == 'G')
        data(i,2) = 'C';
    else
        data(i,2) = 'G';
    end
end

% initialize arrays
lInc = 10000;
p = zeros(lInc,1);
pInd = 0;
proteins = zeros(lInc,4);
proteinsInd = 0;

% loop over each of the strands
for(s = 1:2)
    % compute the probability of A, T, G, and C
    pA = sum(data(:,s) == 'A')/length(data);
    pT = sum(data(:,s) == 'T')/length(data);
    pG = sum(data(:,s) == 'G')/length(data);
    pC = sum(data(:,s) == 'C')/length(data);
    
    % compute the probability of seeing a stop sequence (TAG,TAA,TGA)
    ps = pT*pA*pG + pT*pA*pA + pT*pG*pA;
    pns = 1-ps;
    
    % loop over each frame offset (0,1,2)
    for(o = 0:2)
        % loop over each codon
        N = 0;
        for(i = 1:floor((length(data)-o)/3))
            % check if the codon is a stop codon
            codon = data(3*(i-1)+o+1:3*i+o,s)';
            if(sum(codon == 'TAG') == 3 || sum(codon == 'TAA') == 3 ...
                    || sum(codon == 'TGA') == 3)
                % compute the p-value for this sequence
                pInd = pInd+1;
                p(pInd) = pns^N;
                
                % check if this p-value is significant
                if(p(pInd) < 0.05)
                    proteinsInd = proteinsInd+1;
                    proteins(proteinsInd,:) = [s o 3*(i-N-1)+o+1 3*i+o];
                end
                
                % increase size of arrays if necessary
                if(proteinsInd == length(proteins))
                    proteins = [proteins;zeros(lInc,4)];
                end
                if(pInd == length(p))
                    p = [p;zeros(lInc,1)];
                end
                
                % reset the counter for non-stop codons
                N = 0;
            else
                % iterate the counter for non-stop codons
                N = N+1;
            end
        end
    end
end

% trim the arrays
p = p(1:pInd);
proteins = proteins(1:proteinsInd,:);

% plot a histogram of p-values
figure
hist(p,1000);
title('Histogram of the p-values')
xlabel('p-value')
ylabel('Number of Sequences')

--Daniel Shepard