/* Matrix multiply: optimization by blocking */ #include #include #include /* For Solaris */ /*#include */ #ifndef N # define N 1000 #endif #define min(a,b) (((a)<(b))?(a):(b)) #define max(a,b) (((a)>(b))?(a):(b)) static void matmul_orig(double x[N][N], double y[N][N], double z[N][N]) { int i, j, k; double r; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { r = 0; for (k = 0; k < N; k++) { r = r + y[i][k] * z[k][j]; } x[i][j] = r; } } } static void matmul_bloc(double x[N][N], double y[N][N], double z[N][N], int nb) { int i, j, k, jj, kk; double r; for (jj = 0; jj < N; jj += nb) { for (kk = 0; kk < N; kk += nb) { for (i = 0; i < N; i++) { for (j = jj; j < min(jj + nb, N); j++) { r = 0; for (k = kk; k < min(kk + nb, N); k++) { r = r + y[i][k] * z[k][j]; } x[i][j] = x[i][j] + r; } } } } } static double x[N][N], y[N][N], z[N][N]; int main(int argc, char *argv[]) { int i, j, nb; struct rusage RU; double t, c; if (argc != 2) { fprintf(stderr, "Usage: %s NB\n", argv[0]); fprintf(stderr, "(set NB==0 to test the original loop).\n"); exit(1); } nb = atoi(argv[1]); srand(12345); for (i = 0; i < N; i++) for (j = 0; j < N; j++) { y[i][j] = rand()/(double)RAND_MAX; z[i][j] = rand()/(double)RAND_MAX; } getrusage(RUSAGE_SELF, &RU); t = RU.ru_utime.tv_sec + (double)RU.ru_utime.tv_usec * 1e-6; (nb <= 0) ? matmul_orig(x, y, z) : matmul_bloc(x, y, z, nb); getrusage(RUSAGE_SELF, &RU); t = RU.ru_utime.tv_sec + (double)RU.ru_utime.tv_usec * 1e-6 - t; c = 0; for (i = 0; i < N; i++) for (j = 0; j < N; j++) c += x[i][j]; printf("N: %d NB: %d Time: %0.2lf Cksum: %0.2lf Mflops: %0.2lf\n", N, nb, t, c, (2.0 * N * (double)N * (double)N - N * (double)N) / t * 1e-6); return 0; }