My bad. The M_sum_d8 subroutine has not been merged into the release version of the code. Your implementation of M_sum_d8 calls M_sumf_d, which takes a 4-byte counter, so it will probably just hit the overflow in M_sumf_d instead of M_sum_d.
I added the necessary routines. Could you please try your calculation with the updated patch?
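For context, the root cause is that element-count products like LDIMP*T_INFO%NIONS*WDES%NCDIJ are evaluated in default (32-bit) INTEGER arithmetic and can wrap around before they ever reach the sum routine; that is why the call sites in the patch prepend a 1_qi8 factor. Here is a minimal, self-contained sketch of the failure mode and the promotion trick (the kind definition mirrors qi8 from VASP's prec module; the values are made up):

Code: Select all
PROGRAM overflow_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: qi8 = SELECTED_INT_KIND(18) ! mirrors prec's qi8
   INTEGER :: na = 100000, nb = 50000
   INTEGER(qi8) :: n8
   ! BAD: na*nb is evaluated in 32-bit arithmetic and overflows (5*10^9 > 2^31-1)
   ! n8 = na*nb
   ! GOOD: the leading 1_qi8 factor promotes the whole product to 64 bits
   n8 = 1_qi8*na*nb
   WRITE(*,'(I0)') n8 ! prints 5000000000
END PROGRAM overflow_demo

Here is the full patch: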
Code: Select all
diff -Naur vasp.6.3.0/src/mpi.F ./src/mpi.F
--- vasp.6.3.0/src/mpi.F 2022-01-20 16:10:06.000000000 +0100
+++ ./src/mpi.F 2023-01-04 13:55:47.113559660 +0100
@@ -4378,6 +4378,137 @@
!----------------------------------------------------------------------
!
+! M_sumf_d8: performs a fast global sum on n doubles in
+! vector vec (algorithm by Kresse Georg); this is a special
+! version for giant arrays exceeding an element count n of
+! 2^31-1 (the maximum size that a 32-bit INTEGER can hold).
+! The major difference to routine M_sumf_d is that the last
+! argument ("n") is now declared as 64-bit INTEGER -- and that
+! in addition some intermediate integer operations also need
+! to be done with 64-bit INTEGER temporary variables (adapted by jF)
+!
+!----------------------------------------------------------------------
+
+ SUBROUTINE M_sumf_d8(COMM, vec, n)
+ USE mpimy
+ USE string, ONLY: str
+ USE tutor, ONLY: vtutor
+ IMPLICIT NONE
+
+ TYPE(communic) COMM
+! these must all be declared as 64-bit INTEGER
+ INTEGER(qi8) n,nsummed,ndo,i,j,n_,mmax
+! these must stay 32-bit INTEGER, and a new "n4" is needed for the MPI calls
+ INTEGER ncount,info,n4
+ REAL(q) vec(n)
+!----------------------------------------------------------------------
+#if defined(T3D_SMA)
+!----------------------------------------------------------------------
+ INTEGER MALLOC_DONE
+ INTEGER, EXTERNAL :: ISHM_CHECK
+ COMMON /SHM/ MALLOC_DONE, PBUF
+ POINTER ( PBUF, vec_inter )
+ REAL(q) :: vec_inter(n/COMM%NCPU*COMM%NCPU)
+ INTEGER :: max_
+
+ ! quick return if possible
+ IF (COMM%NCPU == 1) RETURN
+! NO real special handling here (just restore the necessary standard INTEGER);
+! but I doubt that there is still ANY T3D alive (and if so, its memory is small)
+ n4=n
+ max_=n4/COMM%NCPU
+ ! do we have sufficient shm workspace to use the fast interchange algorithm?
+ ! if not, use conventional M_sumb_d
+ IF (ISHM_CHECK(n4) == 0) THEN
+ CALL M_sumb_d(COMM, vec, n4)
+ RETURN
+ ENDIF
+!----------------------------------------------------------------------
+#else
+!----------------------------------------------------------------------
+ REAL(q), ALLOCATABLE :: vec_inter(:)
+ ! maximum work space for quick sum
+!
+! maximum communication blocks
+! too large blocks are slower on the Pentium architecture
+! probably due to caching
+!
+ INTEGER, PARAMETER :: max_=MPI_BLOCK
+
+ ! quick return if possible
+ IF (COMM%NCPU == 1) RETURN
+
+ IF (n/COMM%NCPU < 2147483648_qi8) THEN
+! here the case that n/COMM%NCPU still fits into the 32-bit INTEGER range ...
+ n4=n/COMM%NCPU
+ mmax=MIN(n4,max_)
+ ELSE
+! ... if not: (since max_ is of type 32-bit INTEGER) max_ *must* be the minimum
+ mmax=max_
+ ENDIF
+ ALLOCATE(vec_inter(mmax*COMM%NCPU))
+!----------------------------------------------------------------------
+#endif
+!----------------------------------------------------------------------
+
+ nsummed=0_qi8
+ n_=n/COMM%NCPU
+
+ DO ndo=0_qi8,n_-1_qi8,mmax
+ ! forward exchange
+ ! since the maximum value of mmax is ultimately determined by max_ (MPI_BLOCK)
+ ! which is a 32-bit INTEGER (and hence can never exceed the allowed range
+ ! of 32-bit INTEGER numbers) it is safe to assume that "ncount" may be of
+ ! type "INTEGER" -- this is essential since DAXPY and DCOPY as well as
+ ! all MPI calls require 32-bit INTEGER arguments (for a standard BLAS);
+ ! be warned that the situation might change if COMM%NCPU goes into the
+ ! millions (since actually not ncount alone but ncount*COMM%NCPU is handed
+ ! over as argument!) but then not only here but in ALL other subroutines we
+ ! would run into trouble -- or would need BLAS/MPI routines that accept
+ ! 64-bit INTEGERs (still rare but already existing ...); at the moment I
+ ! can only recommend to set a small enough value for max_ (MPI_BLOCK),
+ ! which must simply be smaller than INT(2147483647/NCPU) ...
+ ncount =MIN(mmax,n_-ndo)
+ nsummed=nsummed+ncount*COMM%NCPU
+
+ CALL M_alltoall_d(COMM, ncount*COMM%NCPU, vec(ndo*COMM%NCPU+1), vec_inter(1))
+ ! sum locally
+ DO i=2, COMM%NCPU
+ CALL DAXPY(ncount, 1.0_q, vec_inter(1+(i-1)*ncount), 1, vec_inter(1), 1)
+ ENDDO
+ ! replicate data (will be sent to each proc)
+ DO i=1, COMM%NCPU
+ DO j=1,ncount
+ vec(ndo*COMM%NCPU+j+(i-1)*ncount) = vec_inter(j)
+ ENDDO
+ ENDDO
+ ! backward exchange
+ CALL M_alltoall_d(COMM, ncount*COMM%NCPU, vec(ndo*COMM%NCPU+1), vec_inter(1))
+ CALL DCOPY( ncount*COMM%NCPU, vec_inter(1), 1, vec(ndo*COMM%NCPU+1), 1 )
+ ENDDO
+
+ ! that should be it
+ IF (n_*COMM%NCPU /= nsummed) THEN
+ CALL vtutor%bug("M_sumf_d8: " // str(n_) // " " // str(nsummed), __FILE__, __LINE__)
+ ENDIF
+
+! Now sum the remaining few elements (we can use M_sumb_d for this task since
+! the number of elements left cannot be larger than "max_" [a 32-bit INTEGER])
+ IF (n-nsummed /= 0_qi8 ) THEN
+ n4=n-nsummed
+ CALL M_sumb_d(COMM, vec(nsummed+1), n4)
+ ENDIF
+
+#if defined(T3D_SMA)
+ ! nope, nothing to do here
+#else
+ DEALLOCATE(vec_inter)
+#endif
+ END SUBROUTINE M_sumf_d8
+
+
+!----------------------------------------------------------------------
+!
! M_sumf_s: performs a fast global sum on n singles in
! vector vec (algorithm by Kresse Georg)
!
@@ -4577,6 +4708,60 @@
!----------------------------------------------------------------------
!
+! M_sum_d8: performs a sum on n double precision numbers;
+! this is for giant arrays with giant dimensions (parameter n potentially
+! exceeding the range of 32-bit INTEGER numbers); it mainly uses M_sumf_d8,
+! the special version of routine M_sumf_d above (use of M_sumb_d is not
+! always possible for "use_collective_sum" -- only if n fits into 32 bits)
+!
+!----------------------------------------------------------------------
+
+ SUBROUTINE M_sum_d8(COMM, vec, n)
+ USE mpimy
+ IMPLICIT NONE
+
+ TYPE(communic) COMM
+ INTEGER(qi8) n
+ REAL(q) vec(n)
+! we need additional local variables
+ INTEGER(qi8) max_
+ INTEGER n4
+
+! ranks that are not in the group return immediately
+ IF ( COMM%MPI_COMM == MPI_COMM_NULL ) THEN
+ RETURN
+ ENDIF
+
+#ifdef use_collective_sum
+! this is now a bit more complicated than in routine M_sum_d ...
+ max_=2147483647_qi8 ! 2^31-1 = maximum possible 32-bit integer
+ IF ( n>max_) THEN
+! here we have no choice; we cannot use M_sumb_d (MPI limitations to 32-bit
+! integers!) and therefore we have to use M_sumf_d8 instead ... !!
+ CALL M_sumf_d8(COMM, vec, n)
+ ELSE
+! only "n" smaller than 2^31 (limit for 32-bit integers) allows use of M_sumb_d
+ n4=n
+ CALL M_sumb_d(COMM, vec, n4)
+ ENDIF
+#else
+! ... and also here we need to take a bit of care ...
+ max_=MPI_BLOCK
+ IF (n>max_) THEN
+! ... for safety we also have to call the "8" version here ...
+ CALL M_sumf_d8(COMM, vec, n)
+ ELSE
+! ... and since MPI_BLOCK (which is a 32-bit INTEGER) is the maximum possible
+! value of n inside this ELSE branch we can keep the call to M_sumb_d here ...
+ n4=n
+ CALL M_sumb_d(COMM, vec, n4)
+ ENDIF
+#endif
+ END SUBROUTINE M_sum_d8
+
+
+!----------------------------------------------------------------------
+!
! M_sum_s: performs a sum on n single prec numbers
! it uses either sumb_s or sumf_s
!
diff -Naur ../original/vasp.6.3.0/src/sphpro.F ./src/sphpro.F
--- ../original/vasp.6.3.0/src/sphpro.F 2022-01-20 16:10:07.000000000 +0100
+++ ./src/sphpro.F 2023-01-04 13:59:43.629249207 +0100
@@ -1461,12 +1461,12 @@
NIS = NIS+T_INFO%NITYP(NT)
ENDDO typ
- CALLMPI( M_sum_d( WDES%COMM, ION_SUM, LDIMP*T_INFO%NIONS*WDES%NCDIJ))
- CALLMPI( M_sum_d( WDES%COMM, ION_SUM_DETAIL, LMDIMP*T_INFO%NIONS*WDES%NCDIJ))
+ CALLMPI( M_sum_d8( WDES%COMM, ION_SUM, 1_qi8*LDIMP*T_INFO%NIONS*WDES%NCDIJ))
+ CALLMPI( M_sum_d8( WDES%COMM, ION_SUM_DETAIL, 1_qi8*LMDIMP*T_INFO%NIONS*WDES%NCDIJ))
IF (LFINAL) THEN
- CALLMPI( M_sum_d( WDES%COMM, PAR(1,1,1,1,1), WDES%NB_TOT*WDES%NKPTS*LPAR*T_INFO%NIONP*WDES%NCDIJ))
+ CALLMPI( M_sum_d8( WDES%COMM, PAR(1,1,1,1,1), 1_qi8*WDES%NB_TOT*WDES%NKPTS*LPAR*T_INFO%NIONP*WDES%NCDIJ))
IF (LORBIT>=12 .AND. LORBIT<=14) THEN
- CALLMPI( M_sum_d( WDES%COMM, PHAS(1,1,1,1,1), LMDIMP*T_INFO%NIONS*WDES%NKPTS*WDES%NB_TOT*WDES%ISPIN*2))
+ CALLMPI( M_sum_d8( WDES%COMM, PHAS(1,1,1,1,1), 1_qi8*LMDIMP*T_INFO%NIONS*WDES%NKPTS*WDES%NB_TOT*WDES%ISPIN*2))
ENDIF
ENDIF
diff -Naur ../original/vasp.6.3.0/src/string.F ./src/string.F
--- ../original/vasp.6.3.0/src/string.F 2022-01-20 16:10:07.000000000 +0100
+++ ./src/string.F 2023-01-04 13:59:46.819231533 +0100
@@ -2,7 +2,8 @@
module string
- use prec, only: q,qd
+!jF: also "qi8" (64-bit integer data type) is needed for my modifications below
+ use prec, only: q,qd,qi8
USE base, ONLY: TOREAL
implicit none
@@ -30,9 +31,10 @@
!! is used instead of the given format whenever the format would result in
!! an error or an overflow.
!!
+!jF: added here "integer8Format" for handling 64-bit integers ...
interface str
module procedure logicalDefault, logicalCompact, logicalFormat, &
- integerFormat, integerArray, realDefault, realFormat, &
+ integerFormat, integer8Format, integerArray, realDefault, realFormat, &
realArray, real2dArray, complexDefault, complexFormat, &
complexArray, complex2dArray
#if !defined(noQuadPrecision) || defined(qd_emulate)
@@ -296,6 +298,21 @@
res = trim(tmp)
end function integerFormat
+ !jF: added here "integer8Format" for handling 64-bit integer numbers ...
+ pure function integer8Format(ivar8, formatString) result (res)
+ integer(qi8), intent(in) :: ivar8
+ character(len=*), intent(in), optional :: formatString
+ character(len=*), parameter :: defaultFormat = '(i0)'
+ character(len=:), allocatable :: res
+ character(len=maxLength) tmp
+ integer ierr
+ if (present(formatString)) &
+ write(tmp, formatString, iostat=ierr) ivar8
+ if (.not.present(formatString).or.ierr /= 0.or.tmp(1:1) == '*') &
+ write(tmp, defaultFormat) ivar8
+ res = trim(tmp)
+ end function integer8Format
+
pure function integerArray(iarr, formatString) result (res)
integer, intent(in) :: iarr(:)
character(len=*), intent(in), optional :: formatString
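As a side note on the MPI_BLOCK caveat in the comments of M_sumf_d8: the safety of the 32-bit "ncount" rests on processing the 64-bit element count in blocks no larger than max_. Here is a self-contained toy (not part of the patch; names chosen to mirror the routine) that isolates just that chunking logic:

Code: Select all
PROGRAM chunk_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: qi8 = SELECTED_INT_KIND(18) ! mirrors prec's qi8
   INTEGER, PARAMETER :: max_ = 4096                 ! stands in for MPI_BLOCK
   INTEGER(qi8) :: n, ndo, nsummed, mmax
   INTEGER :: ncount
   n = 3000000000_qi8                 ! element count beyond the 32-bit range
   mmax = MIN(n, INT(max_, qi8))
   nsummed = 0_qi8
   DO ndo = 0_qi8, n-1_qi8, mmax
      ! ncount never exceeds max_, hence it is safe as a 32-bit INTEGER
      ncount = INT(MIN(mmax, n-ndo))
      ! ... M_sumf_d8 would call M_alltoall_d/DAXPY on ncount elements here ...
      nsummed = nsummed + ncount
   ENDDO
   IF (nsummed /= n) STOP 'chunking error'
   WRITE(*,'(A,I0,A)') 'processed ', nsummed, ' elements in 32-bit-safe blocks'
END PROGRAM chunk_demo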