/** * File Name: pdyn_mm.c * Sample matrix multiplication code using pthreads and self-scheduling. * * To compile: * cc pdyn_mm.c -o pdyn_mm -lpthread * * -Hong Tang, May. 1999 */ # include # include # include # include double gettime() { struct timeval t; gettimeofday(&t, NULL); return (double)t.tv_sec+t.tv_usec*0.000001; } # define MIN(a,b) (((a)>(b))?(b):(a)) static int n; /* size of the matrix */ static int bsize; /* size of the sub-matrix */ static int barea; /* ==bsize*bsize */ static int bnum; /* number of sub-matrix columns/rows in the matrix */ static int nproc; /* number of POSIX threads */ static int d; /* granularity */ static double *A, *B, *C; static pthread_mutex_t lock; static int tasks=0; /* next available task id */ /* get the starting address of submatrix A[i][j] */ double *MAP_BLK_A(int i, int j) { return A+(i*bnum+j)*barea; } /* get the starting address of submatrix B[i][j] */ double *MAP_BLK_B(int i, int j) { return B+(j*bnum+i)*barea; } /* get the starting address of submatrix C[i][j] */ double *MAP_BLK_C(int i, int j) { return C+(i*bnum+j)*barea; } /* get the address of element A[k][l] */ double * MAP_ELEM_A(int k, int l) { int bi=k/bsize, bj=l/bsize; int blk_i=k%bsize, blk_j=l%bsize; return MAP_BLK_A(bi, bj)+blk_i*bsize+blk_j; } /* get the address of element B[k][l] */ double * MAP_ELEM_B(int k, int l) { int bi=k/bsize, bj=l/bsize; int blk_i=k%bsize, blk_j=l%bsize; return MAP_BLK_B(bi, bj)+blk_i*bsize+blk_j; } /* get the address of element C[k][l] */ double * MAP_ELEM_C(int k, int l) { int bi=k/bsize, bj=l/bsize; int blk_i=k%bsize, blk_j=l%bsize; return MAP_BLK_C(bi, bj)+blk_i*bsize+blk_j; } /* the following two functions map a tid to a corresponding (i, j) pair */ int tid2i(int tid) { return tid/bnum; } int tid2j(int tid) { return tid%bnum; } void do_sub_mm(double *c, double *a, double *b) { int i, j, k, t1=0; /* this is a slight better version than sam_mm.c */ /* constant propogation and computation strenght reduction */ /* are performed, just let you have a sense how strange */ /* the code will look like after optimization. */ for (i=0; i= bnum*bnum) return NULL; for (tid=tid_begin; tid .\n", argv[0]); return -1; } n=atoi(argv[1]); bsize=atoi(argv[2]); nproc=atoi(argv[3]); d=atoi(argv[4]); bnum=n/bsize; if (bnum*bsize!=n) { fprintf(stderr, "Invalid parameters .\n", nproc, n, bsize); return -1; } barea=bsize*bsize; A=(double *)malloc(sizeof(double)*(n*n)); B=(double *)malloc(sizeof(double)*(n*n)); C=(double *)malloc(sizeof(double)*(n*n)); threads=(pthread_t *)malloc(sizeof(pthread_t)*(nproc-1)); pthread_mutex_init(&lock, NULL); if (!(A && B && C)) { fprintf(stderr, "Out of Memory.\n"); return -1; } /* zero the memory used by C */ memset((char *)C, 0, sizeof(double)*(n*n)); printf("Initialization ...\n"); /* initialize A and B, this is inefficient but easy to understand */ for (i=0; i