Thank you for sending the input files. The problem is indeed an overflow of the MPI_Allreduce count argument, but what I suggested above would not be sufficient. For your system the eigenvector array holds 64*8*256*12000 complex elements, and since complex numbers are communicated between the cores as two doubles, the count passed to MPI_Allreduce is 64*8*256*12000*2, roughly 3.1e9. That exceeds the largest value a default 4-byte integer can hold (2**31-1, roughly 2.1e9), so the count wraps around.
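To illustrate the failure mode, here is a minimal standalone sketch (not part of VASP or of the patch; the hard-coded dimensions are your system's, used purely for illustration) of what happens when that count is formed in 4-byte arithmetic:

Code:

PROGRAM count_overflow
  IMPLICIT NONE
  INTEGER(KIND=4) :: n4
  INTEGER(KIND=8) :: n8

  ! number of complex eigenvector elements: 64*8*256*12000
  n8 = 64_8*8*256*12000   ! = 1572864000, still below HUGE(n4) = 2147483647
  n4 = INT(n8, 4)

  ! the factor 2 (complex -> two doubles) pushes the count past 2**31-1
  PRINT *, '8-byte count: ', 2_8*n8   ! 3145728000
  PRINT *, '4-byte count: ', 2*n4     ! overflows; typically wraps to a negative value
END PROGRAM count_overflow

An MPI_Allreduce call that receives such a wrapped, negative count fails.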
I created a patch that changes the relevant counters to 8-byte integers; note that the count already has to be formed in 8-byte arithmetic at the call site, before the multiplication can overflow. Unfortunately, I cannot test the patch on a system as large as yours, so could you please apply it to your 6.2.0 code and try to repeat the calculation?
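To apply it, save the diff below to a file (the name counter64.patch is just an example) in the top-level directory of the 6.2.0 source tree and run "patch -p1 < counter64.patch", or "git apply counter64.patch" if you work in a git checkout.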
Code:
diff --git a/src/bse.F b/src/bse.F
index 83de6ab9..682ce859 100644
--- a/src/bse.F
+++ b/src/bse.F
@@ -5926,7 +5926,7 @@ cp: DO N=1,WHF%WDES%COMM_INTER%NCPU
!Gather all parts of the eigenvectors stored on the different nodes
! CALLMPI( M_sum_z(WHF%WDES%COMM, EIGVECT, NBSEEIG*NCV))
- CALLMPI( M_sum_z(WHF%WDES%COMM_INTER, EIGVECT, NBSEEIG*NCV))
+ CALLMPI( M_sum_z8(WHF%WDES%COMM_INTER, EIGVECT, INT(NBSEEIG,8)*NCV))
IF (IU6>=0) CALL WRITE_BSE(WHF,NCV,BSE_INDEX,EIGVECT,R,NBSEEIG)
DEALLOCATE(EIGVECT)
diff --git a/src/mpi.F b/src/mpi.F
index 23e78596..8a63cc65 100644
--- a/src/mpi.F
+++ b/src/mpi.F
@@ -1792,6 +1791,82 @@
END SUBROUTINE M_sumb_d
+!----------------------------------------------------------------------
+!
+! M_sumb_d8: performs a global sum on n doubles in vector vec
+! uses MPI_allreduce which is usually very inefficient
+! faster alternative routines can be found below
+! copy of M_sumb_d with int8 counter
+!
+!----------------------------------------------------------------------
+
+ SUBROUTINE M_sumb_d8(COMM, vec, n )
+#ifdef _OPENACC
+ USE mopenacc_struct_def
+#endif
+ USE mpimy
+ USE string, ONLY: str
+ USE tutor, ONLY: vtutor
+ IMPLICIT NONE
+ INCLUDE "pm.inc"
+
+ TYPE(communic) COMM
+ INTEGER(KIND=8) n,j, ichunk
+ REAL(q) vec(n)
+
+ INTEGER ierror, status(MPI_status_size)
+
+! quick return if possible
+ IF (COMM%NCPU == 1) RETURN
+
+! check whether n is sensible
+ !IF (n==0) THEN
+ ! RETURN
+ !ELSE IF (n<0) THEN
+ ! CALL vtutor%bug("M_sumb_d: invalid vector size n " // str(n), __FILE__, __LINE__)
+ !END IF
+
+ PROFILING_START('m_sumb_d8')
+
+#ifdef _OPENACC
+ IF (ACC_IS_PRESENT(vec).AND.ACC_EXEC_ON) THEN
+! invoke CUDA-aware MPI_allreduce
+!$ACC ENTER DATA CREATE(DTMP_m) ASYNC(ACC_ASYNC_Q)
+ DO j = 1, n, NDTMP
+ ichunk = MIN( n-j+1 , INT(NDTMP,8))
+!$ACC WAIT(ACC_ASYNC_Q)
+!$ACC HOST_DATA USE_DEVICE(vec,DTMP_m)
+ CALL MPI_allreduce( vec(j), DTMP_m(1), INT(ichunk), &
+ MPI_double_precision, MPI_sum, &
+ COMM%MPI_COMM, ierror )
+!$ACC END HOST_DATA
+ CALL __DCOPY__(INT(ichunk) , DTMP_m(1), 1 , vec(j) , 1)
+ ENDDO
+!$ACC EXIT DATA DELETE(DTMP_m) ASYNC(ACC_ASYNC_Q)
+ PROFILING_STOP('m_sumb_d8')
+ RETURN
+ ENDIF
+#endif
+! there is no inplace global sum in MPI, thus we have to use
+! a work array
+
+ DO j = 1, n, NDTMP
+ ichunk = MIN( n-j+1 , INT(NDTMP,8))
+
+ CALL MPI_allreduce( vec(j), DTMP_m(1), INT(ichunk), &
+ MPI_double_precision, MPI_sum, &
+ COMM%MPI_COMM, ierror )
+
+ IF ( ierror /= MPI_success ) &
+ CALL vtutor%error('M_sumb_d8: MPI_allreduce returns: ' // str(ierror))
+
+ CALL DCOPY(INT(ichunk) , DTMP_m(1), 1 , vec(j) , 1)
+ ENDDO
+
+ PROFILING_STOP('m_sumb_d8')
+
+ END SUBROUTINE M_sumb_d8
+
!----------------------------------------------------------------------
!
! M_sumb_d: performs a global sum on n singles in vector vec
@@ -4307,6 +4382,109 @@
#endif
END SUBROUTINE M_sumf_d
+
+!----------------------------------------------------------------------
+!
+! M_sumf_d8: performs a fast global sum on n doubles in
+! vector vec (algorithm by Kresse Georg)
+!
+! uses complete interchange algorithm (my own invention, but I guess
+! some people must know it)
+! exchange data between nodes, sum locally and
+! interchange back, this algorithm is faster than typical MPI based
+! algorithms (on 8 nodes under MPICH a factor 4)
+! copy of M_sumf_d with int8 counter
+!
+!----------------------------------------------------------------------
+
+ SUBROUTINE M_sumf_d8(COMM, vec, n)
+ USE mpimy
+ USE string, ONLY: str
+ USE tutor, ONLY: vtutor
+ IMPLICIT NONE
+
+ TYPE(communic) COMM
+ INTEGER(KIND=8) n,ncount,nsummed,ndo, n_,mmax
+ INTEGER(KIND=8) i,j, info
+ REAL(q) vec(n)
+!----------------------------------------------------------------------
+#if defined(T3D_SMA)
+!----------------------------------------------------------------------
+ INTEGER MALLOC_DONE
+ INTEGER, EXTERNAL :: ISHM_CHECK
+ COMMON /SHM/ MALLOC_DONE, PBUF
+ POINTER ( PBUF, vec_inter )
+ REAL(q) :: vec_inter(n/COMM%NCPU*COMM%NCPU)
+ INTEGER(kind=8) :: max_=n/COMM%NCPU
+
+ ! quick return if possible
+ IF (COMM%NCPU == 1) RETURN
+ ! do we have sufficient shm workspace to use fast interchange algorithm
+ ! no, use the conventional M_sumb_d8
+ IF (ISHM_CHECK(n) == 0) THEN
+ CALL M_sumb_d8(COMM, vec, n)
+ RETURN
+ ENDIF
+!----------------------------------------------------------------------
+#else
+!----------------------------------------------------------------------
+ REAL(q), ALLOCATABLE :: vec_inter(:)
+ ! maximum work space for quick sum
+!
+! maximum communication blocks
+! too large blocks are slower on the Pentium architecture
+! probably due to caching
+!
+ INTEGER(kind=8), PARAMETER :: max_=MPI_BLOCK
+
+ ! quick return if possible
+ IF (COMM%NCPU == 1) RETURN
+
+ mmax=MIN(n/COMM%NCPU,max_)
+ ALLOCATE(vec_inter(mmax*COMM%NCPU))
+!----------------------------------------------------------------------
+#endif
+!----------------------------------------------------------------------
+ nsummed=0
+ n_=n/COMM%NCPU
+
+ DO ndo=0,n_-1,mmax
+ ! forward exchange
+ ncount =MIN(mmax,n_-ndo)
+ nsummed=nsummed+ncount*COMM%NCPU
+
+ CALL M_alltoall_d(COMM, INT(ncount*COMM%NCPU), vec(ndo*COMM%NCPU+1), vec_inter(1))
+ ! sum locally
+ DO i=2, COMM%NCPU
+ CALL DAXPY(INT(ncount), 1.0_q, vec_inter(1+(i-1)*ncount), 1, vec_inter(1), 1)
+ ENDDO
+ ! replicate data (will be sent to each proc)
+ DO i=1, COMM%NCPU
+ DO j=1,ncount
+ vec(ndo*COMM%NCPU+j+(i-1)*ncount) = vec_inter(j)
+ ENDDO
+ ENDDO
+ ! backward exchange
+ CALL M_alltoall_d(COMM, INT(ncount*COMM%NCPU), vec(ndo*COMM%NCPU+1), vec_inter(1))
+ CALL DCOPY( INT(ncount*COMM%NCPU), vec_inter(1), 1, vec(ndo*COMM%NCPU+1), 1 )
+ ENDDO
+
+ ! that should be it
+ !IF (n_*COMM%NCPU /= nsummed) THEN
+ ! CALL vtutor%bug("M_sumf_d: " // str(n_) // " " // str(nsummed), __FILE__, __LINE__)
+ !ENDIF
+
+ IF (n-nsummed /= 0 ) &
+ CALL M_sumb_d8(COMM, vec(nsummed+1), n-nsummed)
+
+#if defined(T3D_SMA)
+ ! nothing to do here
+#else
+ DEALLOCATE(vec_inter)
+#endif
+ END SUBROUTINE M_sumf_d8
+
+
!----------------------------------------------------------------------
!
! M_sumf_s: performs a fast global sum on n singles in
@@ -4426,6 +4604,33 @@
#endif
END SUBROUTINE M_sum_z
+!----------------------------------------------------------------------
+!
+! M_sum_z8: performs a sum on n double complex numbers
+! it uses either M_sumb_d8 or M_sumf_d8
+! copy of M_sum_z with an int8 counter
+!
+!----------------------------------------------------------------------
+
+ SUBROUTINE M_sum_z8(COMM, vec, n)
+ USE mpimy
+ IMPLICIT NONE
+
+ TYPE(communic) COMM
+ INTEGER(KIND=8) n
+ REAL(q) vec(n)
+
+#ifdef use_collective_sum
+ CALL M_sumb_d8(COMM, vec, 2*n)
+#else
+ IF ( 2*n>MPI_BLOCK) THEN
+ CALL M_sumf_d8(COMM, vec, 2*n)
+ ELSE
+ CALL M_sumb_d8(COMM, vec, 2*n)
+ ENDIF
+#endif
+ END SUBROUTINE M_sum_z8
+
!----------------------------------------------------------------------
!
! M_sum_qd: performs a sum on n quadruple complex numbers