# randcorr.py - gets x,y data from file named # by user, calculates correlation and a randomly # -assigned distribution of this coefficient as # a test of its significance # cmc, 5/12/2015 import math import random # getData - asks user for filename, then # reads x,y data from the file # returns list of x,y tuples def getData(): filename = input("enter input file name --> ") dataFile = open(filename, "r") data = [] for line in dataFile: xyStrings = line.split() x = float(xyStrings[0]) y = float(xyStrings[1]) data.append( (x,y) ) dataFile.close() return data # correlation coefficient with just one pass through the data # assumes xydata is a list of x,y tuples def corr(xydata): # first find sums, sums of squares and sum of cross products sumX = sumX2 = sumY = sumY2 = sumproducts = 0.0 for pair in xydata: sumX += pair[0] sumX2 += pair[0] * pair[0] sumY += pair[1] sumY2 += pair[1] * pair[1] sumproducts += pair[0] * pair[1] # find mean and standard deviation for x and y n = len(xydata) nn1 = n * (n - 1) xmean = sumX / n xstdev = math.sqrt( (n * sumX2 - sumX*sumX) / nn1 ) ymean = sumY / n ystdev = math.sqrt( (n * sumY2 - sumY*sumY) / nn1 ) # find and return correlation return (sumproducts - n * xmean * ymean) / ((n - 1) * xstdev * ystdev) # calculates nperms sums of cross-products in xydata (including the # original one plus nperms-1 by random permutations of the y values) # Returns count of sums greater than the original one. # assumes xydata is a list of x,y tuples def countGreater(xydata, nperms): # first copy data into two lists and count items x = [] y = [] n = 0 for pair in xydata: x.append(pair[0]) y.append(pair[1]) n += 1 # a local (inner) function that uses outer function variables def products(): result = 0 for i in range(n): result += x[i] * y[i] return result # now calculate the sums, starting with the original order # and the random permutations, and then return results original = products() greater = 0 for i in range(nperms-1): random.shuffle(y) sumproducts = products() if sumproducts > original: greater += 1 return greater def main(): data = getData() print('\nCorrelation (r) = {0:.2f}'.format(corr(data))) answer = input('\nEstimate significance using randomization? ') if answer[0].lower() == 'y': nperms = 10000 greater = countGreater(data,nperms) print('\t{2} (found {0} greater out of {1} permutations)'. format(greater, nperms, greater/nperms)) main()