# randcorr.py - gets x,y data from file named
#    by user, calculates correlation and a randomly
#    -assigned distribution of this coefficient as
#    a test of its significance
# cmc, 5/12/2015

import math
import random

# getData - asks user for filename, then
#    reads x,y data from the file
# returns list of x,y tuples
def getData():
    filename = input("enter input file name --> ")
    dataFile = open(filename, "r")
    data = []
    for line in dataFile:
        xyStrings = line.split()
        x = float(xyStrings[0])
        y = float(xyStrings[1])
        data.append( (x,y) )
    dataFile.close()
    return data


# correlation coefficient with just one pass through the data
# assumes xydata is a list of x,y tuples
def corr(xydata):

    # first find sums, sums of squares and sum of cross products
    sumX = sumX2 = sumY = sumY2 = sumproducts = 0.0
    for pair in xydata:
        sumX += pair[0]
        sumX2 += pair[0] * pair[0]
        sumY += pair[1]
        sumY2 += pair[1] * pair[1]
        sumproducts += pair[0] * pair[1]

    # find mean and standard deviation for x and y
    n = len(xydata)
    nn1 = n * (n - 1)
    xmean = sumX / n
    xstdev = math.sqrt( (n * sumX2 - sumX*sumX) / nn1 )
    ymean = sumY / n
    ystdev = math.sqrt( (n * sumY2 - sumY*sumY) / nn1 )

    # find and return correlation
    return (sumproducts - n * xmean * ymean) / ((n - 1) * xstdev * ystdev)


# calculates nperms sums of cross-products in xydata (including the
# original one plus nperms-1 by random permutations of the y values)
# Returns count of sums greater than the original one.
# assumes xydata is a list of x,y tuples
def countGreater(xydata, nperms):

    # first copy data into two lists and count items
    x = []
    y = []
    n = 0
    for pair in xydata:
        x.append(pair[0])
        y.append(pair[1])
        n += 1

    # a local (inner) function that uses outer function variables
    def products():
        result = 0
        for i in range(n):
            result += x[i] * y[i]
        return result

    # now calculate the sums, starting with the original order
    # and the random permutations, and then return results
    original = products()
    greater = 0
    for i in range(nperms-1):
        random.shuffle(y)
        sumproducts = products()

        if sumproducts > original:
            greater += 1

    return greater

def main():
    data = getData()
    print('\nCorrelation (r) = {0:.2f}'.format(corr(data)))
    
    answer = input('\nEstimate significance using randomization? ')
    if answer[0].lower() == 'y':
        nperms = 10000
        greater = countGreater(data,nperms)
        print('\t{2} (found {0} greater out of {1} permutations)'.
              format(greater, nperms, greater/nperms))

main()