*
* This is an answer to Exercise 2(1) on page 20 of mpi973.doc
*
* The program uses the non-blocking MPI_Isend/MPI_Irecv functions to
* avoid deadlocks.  Moreover, it allows computation and communication
* to overlap on some systems.
*
      program matmul
      implicit double precision (a-h, o-z)
      include 'mpif.h'
      parameter (nbuffer=450*1024*1024/8)
      dimension buf(nbuffer)
*
      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
*
*     Rank 0 reads the matrix sizes and broadcasts them to all ranks
      if (myrank.eq.0) then
        print *, 'Enter M, N, L: '
        call flush(6)
        read(*,*) M, N, L
      endif
      call MPI_Bcast(M, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
      call MPI_Bcast(N, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
      call MPI_Bcast(L, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
*
      if ( mod(m,nprocs).ne.0 .or. mod(l,nprocs).ne.0 ) then
        if (myrank.eq.0) print *, 'M or L is not divisible by nprocs!'
        call MPI_Finalize(ierr)
        stop
      endif
*
*     Partition buf into the local blocks a (m/nprocs x n),
*     b (n x l/nprocs), c (m/nprocs x l) and the receive buffer work
      ia   = 1
      ib   = ia  + m/nprocs * n
      ic   = ib  + n * l/nprocs
      iwk  = ic  + m/nprocs * l
      iend = iwk + n * l/nprocs
      if ( iend .gt. nbuffer+1 ) then
        if (myrank.eq.0) print *, 'Insufficient buffer size!'
        call MPI_Finalize(ierr)
        stop
      endif
*
      call sub( m, n, l, myrank, nprocs, buf(ia), buf(ib), buf(ic),
     &          buf(iwk) )
*
      call MPI_Finalize(ierr)
      stop
      end
*
*------------------------------------------------------------------
*
      subroutine sub(m, n, l, myrank, nprocs, a, b, c, work)
      implicit double precision (a-h, o-z)
      include 'mpif.h'
      dimension a(m/nprocs, n), b(n, l/nprocs), c(m/nprocs, l),
     &          work(n, l/nprocs)
      integer src, dest, tag, local_code, code
      integer status(MPI_STATUS_SIZE, 2), request(2)
      double precision time_start, time_end
*
      mloc = m/nprocs
      lloc = l/nprocs
*
*     Ring topology: send the local block of b to the left neighbour
*     and receive the next block from the right neighbour
      dest = mod( myrank-1+nprocs, nprocs )
      src  = mod( myrank+1, nprocs )
*
*     Assign values to a, b
      do j=1, n
        do i=1, mloc
          a(i,j) = i+myrank*mloc
        enddo
      enddo
*
      do j=1, lloc
        do i=1, n
          b(i,j) = j+myrank*lloc
        enddo
      enddo
*
      time_start = MPI_Wtime()
*
*     jpos is the global column offset of the locally held b block
      jpos = myrank*lloc
*
*     In each of the nprocs-1 steps: start the shift of b, multiply
*     with the block currently held, then wait for the shift to finish
      do ip=1, nprocs - 1
*
        tag = 10000 + ip
        call MPI_Isend( b, n*lloc, MPI_DOUBLE_PRECISION, dest, tag,
     &                  MPI_COMM_WORLD, request(1), ierr )
        call MPI_Irecv( work, n*lloc, MPI_DOUBLE_PRECISION, src, tag,
     &                  MPI_COMM_WORLD, request(2), ierr )
*
        do j=1, lloc
          do i=1, mloc
            sum=0.d0
            do k=1, n
              sum = sum + a(i,k) * b(k,j)
            enddo
            c(i, j+jpos) = sum
          enddo
        enddo
*
        call MPI_Waitall(2, request, status, ierr)
*
        do j=1, lloc
          do i=1, n
            b(i,j) = work(i,j)
          enddo
        enddo
*
        jpos = jpos + lloc
        if ( jpos .ge. l ) jpos = 0
*
      enddo
*
*     Multiply with the last block of b received
      do j=1, lloc
        do i=1, mloc
          sum=0.d0
          do k=1, n
            sum = sum + a(i,k) * b(k,j)
          enddo
          c(i, j+jpos) = sum
        enddo
      enddo
*
      time_end = MPI_Wtime()
*
*     Check results
      local_code = 0
      do j=1, l
        do i=1, mloc
          if ( abs(c(i,j) - n*dble(i+myrank*mloc)*j) .gt. 1d-10 ) then
            local_code = 1
            goto 10
          endif
        enddo
      enddo
*
 10   call MPI_Reduce( local_code, code, 1, MPI_INTEGER, MPI_SUM, 0,
     &                 MPI_COMM_WORLD, ierr)
*
      if ( myrank .eq. 0 ) then
        print *, 'code = ', code
        print *, 'time = ', time_end-time_start
        print *, 'mflops = ', m*(n+n-1.d0)*l/(time_end-time_start)*1d-6
      endif
*
      return
      end
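*
*------------------------------------------------------------------
*
* For comparison only: the ring shift could also be written with the
* blocking MPI_Sendrecv call, which is likewise deadlock-free but
* cannot overlap the local multiplication with the communication the
* way the MPI_Isend/MPI_Irecv pair above does.  The subroutine below
* is a minimal sketch of that variant; it is not called by the
* program above, and its name (shiftb) is ours, not part of the
* exercise.
*
      subroutine shiftb(n, lloc, b, work, dest, src, tag)
      implicit double precision (a-h, o-z)
      include 'mpif.h'
      dimension b(n, lloc), work(n, lloc)
      integer dest, src, tag
      integer status(MPI_STATUS_SIZE)
*
*     Send the local block of b to the left neighbour and receive the
*     next block from the right neighbour in a single blocking call
      call MPI_Sendrecv( b, n*lloc, MPI_DOUBLE_PRECISION, dest, tag,
     &                   work, n*lloc, MPI_DOUBLE_PRECISION, src, tag,
     &                   MPI_COMM_WORLD, status, ierr )
*
*     Copy the received block into b for the next multiplication step
      do j=1, lloc
        do i=1, n
          b(i,j) = work(i,j)
        enddo
      enddo
*
      return
      end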