# /Segment34

## To Calculate

1. Use the permutation test to decide whether the contingency table $\begin{bmatrix} 5 & 3 & 2\\ 2 & 3 & 6 \\ 0 & 2 & 3 \end{bmatrix}$ shows a significant association. What is the p-value?

2. Repeat the calculation using the Pearson chi-square statistic instead of the Wald statistic, or vice versa.

I don't have a way of defining 0/0 = 0 in the Python I work with, so I added 1 to each expected count as a workaround. With the Pearson chi-square statistic, I got a p-value of 0.242.


import numpy as np
import random
import matplotlib.pyplot as plt
# Observed 3x3 contingency table, entered one row at a time.
f = [5, 3, 2]
g = [2, 3, 6]
h = [0, 2, 3]
m = np.array([f, g, h])

def master(m, number, type=None):
    """Permutation test for association in the contingency table `m`.

    Shuffles the row labels `number` times, recomputes the chi-square
    statistic for each shuffled table, and returns the fraction of
    permuted statistics strictly greater than the observed one (the
    permutation p-value).  Also shows a histogram of the permuted
    statistics via matplotlib.

    `type` was a required third parameter that the script's call site
    never supplied (TypeError); it is unused, so it now defaults to
    None for backward compatibility.
    """
    rows, columns = mdecomp(m)
    trueval = contingency_table(m)
    print(trueval)
    stats = []
    for _ in range(number):
        newrows = shuf(rows)
        # NOTE: the original printed every shuffled table here, which
        # floods stdout for large `number`; that debug print is removed.
        stats.append(contingency_table(count(newrows, columns)))
    # Count permuted statistics strictly greater than the observed one.
    # (The original accumulated this in a local `f`, shadowing the
    # module-level row list `f`.)
    exceed = sum(1 for s in stats if s > trueval)
    plt.hist(stats, bins=50)
    plt.show()
    return float(exceed) / float(number)

def mdecomp(m):
    """Expand the 3x3 table `m` into two label strings, one character
    per observation.

    Returns (rows, columns): `rows` repeats 'f'/'g'/'h' according to the
    row sums and `columns` repeats 'a'/'b'/'c' according to the column
    sums, so the two strings have equal length N (the table total).
    """
    col_totals = m.sum(axis=0)
    row_totals = m.sum(axis=1)
    columns = ''.join(label * int(n) for label, n in zip('abc', col_totals))
    rows = ''.join(label * int(n) for label, n in zip('fgh', row_totals))
    return rows, columns

def shuf(row):
    """Return a uniformly random permutation of the characters of `row`
    as a new string (the input is not modified)."""
    shuffled_chars = random.sample(row, len(row))
    return ''.join(shuffled_chars)

def colcount(i, columns):
    """One-hot [a, b, c] indicator for the column label at position `i`
    of `columns`.

    Returns [1, 0, 0] for 'a', [0, 1, 0] for 'b', [0, 0, 1] for 'c',
    and [0, 0, 0] for any other character.
    """
    label = columns[i]
    return [int(label == category) for category in 'abc']

def count(rows, columns):
    """Rebuild a 3x3 contingency table from paired label strings.

    `rows` is a string of 'f'/'g'/'h' row labels and `columns` an
    equally long string of 'a'/'b'/'c' column labels; cell (i, j) counts
    the positions where row label i co-occurs with column label j.
    Characters outside those alphabets are ignored, matching the
    original's behavior.

    Fixes two issues in the original: `row_1 = row_2 = row_3 = [0,0,0]`
    aliased a single list three ways, and `colcount(i, columns)` was
    re-invoked for every element of each list comprehension (three
    redundant calls per character).
    """
    row_index = {'f': 0, 'g': 1, 'h': 2}
    col_index = {'a': 0, 'b': 1, 'c': 2}
    table = np.zeros((3, 3), dtype=int)
    for r, c in zip(rows, columns):
        if r in row_index and c in col_index:
            table[row_index[r], col_index[c]] += 1
    return table

def contingency_table(table):  # Adapted from Jeff's code
    """Pearson chi-square statistic for a contingency table.

    Expected counts are row_sum * col_sum / N.  A cell whose expected
    count is 0 necessarily has an observed count of 0, so its 0/0 term
    is defined to contribute 0 — the original added 1 to every
    denominator instead, which biased the statistic downward.

    The original also computed a p-value via scipy.stats without ever
    importing scipy (a NameError on every call) and then discarded it;
    that dead code is removed — the permutation test only needs the
    statistic, which is what this function returns.
    """
    col_sums = table.sum(axis=0)
    row_sums = table.sum(axis=1)
    N = float(table.sum())
    if N == 0:
        return 0.0  # empty table: no observations, no discrepancy
    expected = np.outer(row_sums, col_sums) / N
    # Elementwise (obs - exp)^2 / exp with 0/0 treated as 0.
    safe_exp = np.where(expected > 0, expected, 1.0)
    chisqs = np.where(expected > 0, (table - expected) ** 2 / safe_exp, 0.0)
    return chisqs.sum()

# Run the permutation test (1000 shuffles) and show the label expansion.
# The original called master(m, 1000) although master required three
# positional arguments; pass None explicitly for the unused third
# parameter so the call works regardless of whether it has a default.
print(master(m, 1000, None))
print(mdecomp(m))



1. Is slide 7's suggestion — that you figure out how to implement the permutation test without "expanding all the data" — actually possible? If so, what is your method?

I would set a threshold for the counts of each category — say, only record a count in the expanded vectors once every 10 counts. This threshold can vary depending on the precision we want and the highest frequency in the table.