From 51cd620f459251e583119e03106100ec7ef9d9b5 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 24 Sep 2021 13:54:46 -0400 Subject: [PATCH 01/16] Created Makefiles tailored to work well for the Intel Advisor profiler --- Makefile | 20 +++++++++++--------- Makefile.Defines | 12 +++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 34df2b87a..4ee3be240 100644 --- a/Makefile +++ b/Makefile @@ -62,14 +62,16 @@ include Makefile.Defines MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul +IADVIXE = -I$(ADVISOR_2019_DIR)/include/ia64 +LADVIXE = -L$(ADVISOR_2019_DIR)/lib64 MODULES = $(SWIFTEST_MODULES) $(USER_MODULES) .PHONY : all mod fast strict drivers bin clean force % : %.f90 force - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $< -o $@ \ - -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) $< -o $@ \ + -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) $(LADVIXE) $(INSTALL_PROGRAM) $@ $(SWIFTEST_HOME)/bin rm -f $@ @@ -82,7 +84,7 @@ all: mod: cd $(SWIFTEST_HOME)/src/modules/; \ - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c $(MODULES); \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c $(MODULES); \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o; \ $(INSTALL_DATA) *.mod *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.mod *.smod @@ -186,34 +188,34 @@ strict: ln -s $(SWIFTEST_HOME)/Makefile .; \ make strictdir cd $(SWIFTEST_HOME)/src/helio; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c helio_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c helio_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/symba; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c symba_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c symba_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/rmvs; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c rmvs_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c rmvs_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/whm; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c whm_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c whm_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod fastdir: - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod strictdir: - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smo diff --git a/Makefile.Defines b/Makefile.Defines index 278388cb0..54fdbe3f7 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -45,10 +45,7 @@ COLLRESOLVE_HOME = $(ROOT_DIR)/collresolve/ # DO NOT include in FFLAGS the "-c" option to compile object only # this is done explicitly as needed in the Makefile -ADVIXE_DIR = /apps/cent7/intel/advisor_2019 -ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vecabi=cmdtarget -simd -shared-intel -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -xhost -traceback - -VTUNE_FLAGS = -g -O2 -qopt-report=5 -simd -shared-intel -qopenmp -debug inline-debug-info -parallel-source-info=2 -parallel -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -qopenmp -fp-model no-except -mp1 -xhost -traceback +ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vec -shared-intel -ldl -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -traceback #Be sure to set the environment variable KMP_FORKJOIN_FRAMES=1 for OpenMP debuging in vtune IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin @@ -69,14 +66,15 @@ GPRODUCTION = -O2 -ffree-line-length-none $(GPAR) #FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) #FFASTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) -FSTRICTFLAGS = $(IPRODUCTION) $(STRICTREAL) $(OPTREPORT) #$(ADVIXE_FLAGS) -FFLAGS = $(IPRODUCTION) -fp-model=fast $(OPTREPORT) #$(ADVIXE_FLAGS) +FSTRICTFLAGS = $(ADVIXE_FLAGS) $(STRICTREAL) $(SIMDVEC) $(PAR) +FFLAGS = $(ADVIXE_FLAGS) -fp-model=fast $(SIMDVEC) $(PAR) FORTRAN = ifort AR = xiar #FORTRAN = gfortran #FFLAGS = $(GDEBUG) # $(GMEM) $(GPAR) -#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only +#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall +#AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only # this is done explicitly as needed in the Makefile CC = icc From a703880e054c344c52c66b3af46f5c4f3e8b7ef2 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Wed, 20 Oct 2021 13:25:56 -0400 Subject: [PATCH 02/16] Streamlined compiler arguments --- Makefile.Defines | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/Makefile.Defines b/Makefile.Defines index 3aaa18138..a763c316f 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -48,39 +48,20 @@ COLLRESOLVE_HOME = $(ROOT_DIR)/collresolve/ ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vec -shared-intel -ldl -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -traceback #Be sure to set the environment variable KMP_FORKJOIN_FRAMES=1 for OpenMP debuging in vtune -IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin STRICTREAL = -fp-model=precise -prec-div -prec-sqrt -assume protect-parens SIMDVEC = -simd -xhost -align all -assume contiguous_assumed_shape -vecabi=cmdtarget -fp-model no-except -fma PAR = -qopenmp -parallel -HEAPARR = -heap-arrays 4194304 -OPTREPORT = -qopt-report=5 -IPRODUCTION = -no-wrap-margin -O3 -qopt-prefetch=0 -sox $(PAR) $(SIMDVEC) #$(HEAPARR) - -#gfortran flags -GDEBUG = -g -Og -fbacktrace -fbounds-check -ffree-line-length-none -GPAR = -fopenmp -ftree-parallelize-loops=4 -GMEM = -fsanitize-address-use-after-scope -fstack-check -fsanitize=bounds-strict -fsanitize=undefined -fsanitize=signed-integer-overflow -fsanitize=object-size -fstack-protector-all -GWARNINGS = -Wall -Warray-bounds -Wimplicit-interface -Wextra -Warray-temporaries -GPRODUCTION = -O3 -ffree-line-length-none $(GPAR) MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul -#FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) -#FFASTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) + FSTRICTFLAGS = $(ADVIXE_FLAGS) $(STRICTREAL) $(SIMDVEC) $(PAR) FFLAGS = $(ADVIXE_FLAGS) -fp-model=fast $(SIMDVEC) $(PAR) FORTRAN = ifort AR = xiar -#FORTRAN = gfortran -#FFLAGS = $(GDEBUG) # $(GMEM) $(GPAR) -#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall -#AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only -# this is done explicitly as needed in the Makefile - -#CC = icc -CC = cc +CC = icc CFLAGS = -O3 -w -m64 -std=c99 64_BIT_REALS = -r8 From c495f5823f2f0a0f198d6960feab62bf651b6321 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Thu, 21 Oct 2021 10:45:25 -0400 Subject: [PATCH 03/16] Restored production compiler flags --- Makefile.Defines | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Makefile.Defines b/Makefile.Defines index 443284a86..c2d0258d5 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -66,13 +66,12 @@ GMEM = -fsanitize-address-use-after-scope -fstack-check -fsanitize=bounds-stri GWARNINGS = -Wall -Warray-bounds -Wimplicit-interface -Wextra -Warray-temporaries GPRODUCTION = -O3 -ffree-line-length-none $(GPAR) - MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul -FSTRICTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) -FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) +FSTRICTFLAGS = $(IPRODUCTION) $(STRICTREAL) +FFLAGS = $(IPRODUCTION) -fp-model=fast FORTRAN = ifort AR = xiar CC = icc From 5d9859d6ef8be7a7d8e0c0523a1594a7b2aaf95b Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 03:00:30 -0400 Subject: [PATCH 04/16] Added timing instrumentation to encounter checks --- src/encounter/encounter_check.f90 | 48 +++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index c64dbd68a..f32cd1db7 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -377,6 +377,8 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, npl_last = npl end if + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(x, v, renc, boundingbox) & !$omp firstprivate(dt, npl, n) @@ -392,17 +394,31 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, x(dim,1:npl) + renc(1:npl) + vshift_max(1:npl) * v(dim,1:npl) * dt]) end do !$omp end parallel do + call timer%stop() + write(*,*) "plpl sort : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call boundingbox%sweep(npl, nenc, index1, index2) + call timer%stop() + write(*,*) "plpl sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Now that we have identified potential pairs, use the narrow-phase process to get the final values allocate(lencounter(nenc)) allocate(lvdotr(nenc)) + call timer%reset() + call timer%start() call encounter_check_all(nenc, index1, index2, x, v, x, v, renc, renc, dt, lencounter, lvdotr) + call timer%stop() + write(*,*) "plpl check : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_reduce_broadphase(npl, nenc, index1, index2, lencounter, lvdotr) + call timer%stop() + write(*,*) "plpl reduce: ",timer%count_stop_step - timer%count_start_step deallocate(lencounter) end if @@ -447,9 +463,7 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt ntot = nplm + nplt n = 2 * ntot if (ntot /= ntot_last) then - call boundingbox%setup(ntot, ntot_last) - ntot_last = ntot end if @@ -479,8 +493,14 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt xplt(dim,1:nplt) + renct(1:nplt) + vpltshift_max(1:nplt) * vplt(dim,1:nplt) * dt]) end do !$omp end parallel do + call timer%stop() + write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call boundingbox%sweep(nplm, nplt, nenc, index1, index2) + call timer%stop() + write(*,*) "plplm sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Shift tiny body indices back into the range of the input position and velocity arrays @@ -490,12 +510,20 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt allocate(lencounter(nenc)) allocate(lvdotr(nenc)) + call timer%reset() + call timer%start() call encounter_check_all(nenc, index1, index2, xplm, vplm, xplt, vplt, rencm, renct, dt, lencounter, lvdotr) + call timer%stop() + write(*,*) "plplm check : ",timer%count_stop_step - timer%count_start_step ! Shift the tiny body indices back to their natural range index2(:) = index2(:) + nplm + call timer%reset() + call timer%start() call encounter_check_reduce_broadphase(ntot, nenc, index1, index2, lencounter, lvdotr) + call timer%stop() + write(*,*) "plplm reduce: ",timer%count_stop_step - timer%count_start_step end if return end subroutine encounter_check_all_sort_and_sweep_plplm @@ -656,6 +684,8 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call util_index_array(ind_arr, npl) + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(x, v, renc, lenc, ind_arr) & !$omp firstprivate(npl, dt) @@ -667,8 +697,14 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde renc(i), renc(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do + call timer%stop() + write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) + call timer%stop() + write(*,*) "plpl tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plpl @@ -704,6 +740,8 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call util_index_array(ind_arr, nplt) + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(xplm, vplm, xplt, vplt, rencm, renct, lenc, ind_arr) & !$omp firstprivate(nplm, nplt, dt) @@ -715,8 +753,14 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp rencm(i), renct(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do + call timer%stop() + write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) + call timer%stop() + write(*,*) "plplm tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plplm From 5ce265567e55dea659a01e4b179ca00c1f16cf9d Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 03:06:18 -0400 Subject: [PATCH 05/16] Added missing timer reset --- src/encounter/encounter_check.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index f32cd1db7..c9602f995 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -467,6 +467,8 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt ntot_last = ntot end if + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(xplm, xplt, vplm, vplt, rencm, renct, boundingbox) & !$omp firstprivate(dt, nplm, nplt, ntot) From ed2bdafdba8e0184c9e1cb5eeec9756850110ec2 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 16:34:21 -0400 Subject: [PATCH 06/16] Swapped insertion sort for quicksort for better performance in the sort/sweep algorithm --- src/encounter/encounter_check.f90 | 155 ++++++++++++++++-------------- src/util/util_sort.f90 | 85 +++++++++++++--- 2 files changed, 155 insertions(+), 85 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index c9602f995..93580eb33 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -497,7 +497,7 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt !$omp end parallel do call timer%stop() write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step - + call timer%reset() call timer%start() call boundingbox%sweep(nplm, nplt, nenc, index1, index2) @@ -702,11 +702,7 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call timer%stop() write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) - call timer%stop() - write(*,*) "plpl tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plpl @@ -758,11 +754,7 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call timer%stop() write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) - call timer%stop() - write(*,*) "plplm tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plplm @@ -975,16 +967,22 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind integer(I4B), dimension(:), allocatable, intent(out) :: index1 !! List of indices for body 1 in each encounter candidate pair integer(I4B), dimension(:), allocatable, intent(out) :: index2 !! List of indices for body 2 in each encounter candidate pair !Internals - integer(I4B) :: i, k, ntot + integer(I4B) :: i, k, ntot, dim type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr + integer(I4B), dimension(:), allocatable :: ibeg, iend ntot = n1 + n2 call util_index_array(ind_arr, ntot) + allocate(ibeg(SWEEPDIM * ntot)) + allocate(iend(SWEEPDIM * ntot)) + do dim = 1, SWEEPDIM + ibeg((dim - 1) * ntot + 1:dim * ntot) = self%aabb(dim)%ibeg(:) + iend((dim - 1) * ntot + 1:dim * ntot) = self%aabb(dim)%iend(:) + end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - - call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), self%aabb(1)%ibeg(:), self%aabb(1)%iend(:), self%aabb(2)%ibeg(:), self%aabb(2)%iend(:), ind_arr(:), lenc(:)) + call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) @@ -1012,16 +1010,23 @@ module subroutine encounter_check_sweep_aabb_single_list(self, n, nenc, index1, integer(I4B), dimension(:), allocatable, intent(out) :: index1 !! List of indices for body 1 in each encounter candidate pair integer(I4B), dimension(:), allocatable, intent(out) :: index2 !! List of indices for body 2 in each encounter candidate pair !Internals - Integer(I4B) :: i, k + Integer(I4B) :: i, k, dim type(encounter_list), dimension(n) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr type(walltimer) :: timer + integer(I4B), dimension(:), allocatable :: ibeg, iend call util_index_array(ind_arr, n) + allocate(ibeg(SWEEPDIM * n)) + allocate(iend(SWEEPDIM * n)) + do dim = 1, SWEEPDIM + ibeg((dim - 1) * n + 1:dim * n) = self%aabb(dim)%ibeg(:) + iend((dim - 1) * n + 1:dim * n) = self%aabb(dim)%iend(:) + end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - call encounter_check_sweep_aabb_all_single_list(n, self%aabb(1)%ind(:), self%aabb(1)%ibeg(:), self%aabb(1)%iend(:), self%aabb(2)%ibeg(:), self%aabb(2)%iend(:), ind_arr(:), lenc(:)) + call encounter_check_sweep_aabb_all_single_list(n, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, n]), reshape(iend(:), [SWEEPDIM, n]), ind_arr(:), lenc(:)) call encounter_check_collapse_ragged_list(lenc, n, nenc, index1, index2) @@ -1036,34 +1041,34 @@ module subroutine encounter_check_sweep_aabb_single_list(self, n, nenc, index1, end subroutine encounter_check_sweep_aabb_single_list - subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) + subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibeg, iend, ind_arr, lenc) !! author: David A. Minton !! !! Performs the loop part of the sweep operation. Double list version (e.g. pl-tp or plm-plt) implicit none ! Arguments - integer(I4B), intent(in) :: n1, n2 !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes - type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body + integer(I4B), intent(in) :: n1, n2 !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions + integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes + type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body ! Internals integer(I4B) :: i, ntot logical, dimension(n1+n2) :: lencounteri - integer(I4B) :: ibegxi, iendxi, ibegyi, iendyi + integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi ntot = n1 + n2 !$omp parallel do default(private) schedule(guided)& - !$omp shared(ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) & - !$omp firstprivate(ntot, n1, n2) + !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & + !$omp firstprivate(ntot, n1, n2) & + !$omp lastprivate(ibegi, iendi) do i = 1, ntot - ibegxi = ibegx(i) + 1 - iendxi = iendx(i) - 1 - if (iendxi >= ibegxi) then - ibegyi = ibegy(i) - iendyi = iendy(i) - call encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind(:), ibegxi, iendxi, ibegyi, iendyi, ibegx(:), iendx(:), ibegy(:), iendy(:), lencounteri(:)) + ibegi(1) = ibeg(1,i) + 1 + iendi(1) = iend(1,i) - 1 + if (iendi(1) >= ibegi(1)) then + ibegi(2:SWEEPDIM) = ibeg(2:SWEEPDIM,i) + iendi(2:SWEEPDIM) = iend(2:SWEEPDIM,i) + call encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind(:), ibegi(:), iendi(:), ibeg(:,:), iend(:,:), lencounteri(:)) call encounter_check_make_ragged_list(lencounteri(:), ind_arr(:), lenc(i)%nenc, lenc(i)%index2) else lenc(i)%nenc = 0 @@ -1075,33 +1080,33 @@ subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibegx, ie end subroutine encounter_check_sweep_aabb_all_double_list - subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) + subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, ind_arr, lenc) !! author: David A. Minton !! !! Performs the loop part of the sweep operation. Single list version (e.g. pl-pl) implicit none ! Arguments - integer(I4B), intent(in) :: n !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes - type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body + integer(I4B), intent(in) :: n !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimension + integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes + type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body ! Internals integer(I4B) :: i logical, dimension(n) :: lencounteri - integer(I4B) :: ibegxi, iendxi, ibegyi, iendyi + integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi !$omp parallel do default(private) schedule(guided)& - !$omp shared(ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) & - !$omp firstprivate(n) + !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & + !$omp firstprivate(n) & + !$omp lastprivate(ibegi, iendi) do i = 1, n - ibegxi = ibegx(i) + 1 - iendxi = iendx(i) - 1 - if (iendxi >= ibegxi) then - ibegyi = ibegy(i) - iendyi = iendy(i) - call encounter_check_sweep_aabb_one_single_list(n, ext_ind(:), ibegxi, iendxi, ibegyi, iendyi, ibegx(:), iendx(:), ibegy(:), iendy(:), lencounteri(:)) + ibegi(1) = ibeg(1,i) + 1 + iendi(1) = iend(1,i) - 1 + if (iendi(1) >= ibegi(1)) then + ibegi(2:SWEEPDIM) = ibeg(2:SWEEPDIM,i) + iendi(2:SWEEPDIM) = iend(2:SWEEPDIM,i) + call encounter_check_sweep_aabb_one_single_list(n, ext_ind(:), ibegi(:), iendi(:), ibeg(:,:), iend(:,:), lencounteri(:)) call encounter_check_make_ragged_list(lencounteri(:), ind_arr(:), lenc(i)%nenc, lenc(i)%index2) else lenc(i)%nenc = 0 @@ -1113,60 +1118,66 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibegx, iendx, end subroutine encounter_check_sweep_aabb_all_single_list - pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegxi, iendxi, ibegyi, iendyi, ibegx, iendx, ibegy, iendy, lencounteri) + pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Encounters from the same lists not allowed (e.g. pl-tp encounters only) implicit none ! Arguments - integer(I4B), intent(in) :: i !! The current index of the ith body - integer(I4B), intent(in) :: n1 !! Number of bodies 1 - integer(I4B), intent(in) :: n2 !! Number of bodies 2 - integer(I4B), intent(in) :: ntot !! n1 + n2 - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), intent(in) :: ibegxi, iendxi !! The beginning and ending indices of the ith bounding box in the x-dimension - integer(I4B), intent(in) :: ibegyi, iendyi !! The beginning and ending indices of the ith bounding box in the y-dimension - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimensio - logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body + integer(I4B), intent(in) :: i !! The current index of the ith body + integer(I4B), intent(in) :: n1 !! Number of bodies 1 + integer(I4B), intent(in) :: n2 !! Number of bodies 2 + integer(I4B), intent(in) :: ntot !! n1 + n2 + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:), intent(in) :: ibegi, iendi !! The beginning and ending indices of the ith bounding box in the n-dimensions + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions + logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox + integer(I4B) :: j, jbox, dim + logical, dimension(SWEEPDIM) :: lenc lencounteri(:) = .false. - do concurrent(jbox = ibegxi:iendxi) ! Sweep forward until the end of the interval + lenc(1) = .true. + do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval j = ext_ind(jbox) if (j > ntot) j = j - ntot ! If this is an endpoint index, shift it to the correct range if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed ! Check the y-dimension - lencounteri(j) = (iendy(j) > ibegyi) .and. (ibegy(j) < iendyi) + do dim = 2, SWEEPDIM + lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) + end do + lencounteri(j) = all(lenc(:)) end do return end subroutine encounter_check_sweep_aabb_one_double_list - pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegxi, iendxi, ibegyi, iendyi, ibegx, iendx, ibegy, iendy, lencounteri) + pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Mutual encounters allowed (e.g. pl-pl) implicit none ! Arguments - integer(I4B), intent(in) :: n !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), intent(in) :: ibegxi, iendxi !! The beginning and ending indices of the ith bounding box in the x-dimension - integer(I4B), intent(in) :: ibegyi, iendyi !! The beginning and ending indices of the ith bounding box in the y-dimension - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body + integer(I4B), intent(in) :: n !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:), intent(in) :: ibegi, iendi !! The beginning and ending indices of the ith bounding box in the n-dimensions + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the x-dimension + logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox + integer(I4B) :: j, jbox, dim + logical, dimension(SWEEPDIM) :: lenc lencounteri(:) = .false. - do concurrent(jbox = ibegxi:iendxi) ! Sweep forward until the end of the interval + lenc(1) = .true. + do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval j = ext_ind(jbox) if (j > n) j = j - n ! If this is an endpoint index, shift it to the correct range - ! Check the y-dimension - lencounteri(j) = (iendy(j) > ibegyi) .and. (ibegy(j) < iendyi) + ! Check the other dimensions + do dim = 2, SWEEPDIM + lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) + end do + lencounteri(j) = all(lenc(:)) end do return diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index 8ea7a3026..9cf89faee 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -60,7 +60,7 @@ end subroutine util_sort_body module pure subroutine util_sort_dp(arr) !! author: David A. Minton !! - !! Sort input double precision array in place into ascending numerical order using insertion sort. + !! Sort input DP precision array in place into ascending numerical order using insertion sort. !! This algorithm works well for partially sorted arrays (which is usually the case here) !! implicit none @@ -89,7 +89,7 @@ end subroutine util_sort_dp module pure subroutine util_sort_index_dp(arr, ind) !! author: David A. Minton !! - !! Sort input double precision array by index in ascending numerical order using insertion sort. + !! Sort input DP precision array by index in ascending numerical order using insertion sort. !! This algorithm works well for partially sorted arrays (which is usually the case here). !! If ind is supplied already allocated, we assume it is an existing index array (e.g. a previously !! sorted array). If it is not allocated, this subroutine allocates it. @@ -99,27 +99,86 @@ module pure subroutine util_sort_index_dp(arr, ind) real(DP), dimension(:), intent(in) :: arr integer(I4B), dimension(:), allocatable, intent(inout) :: ind ! Internals - integer(I4B) :: n, i, j, jp, itmp + integer(I4B) :: n, i, j, itmp + real(DP), dimension(:), allocatable :: tmparr n = size(arr) if (.not.allocated(ind)) then allocate(ind(n)) ind = [(i, i=1, n)] end if - do i = 2, n - itmp = ind(i) - j = i - 1 - do while (j >= 1) - if (arr(ind(j)) <= arr(itmp)) exit - ind(j + 1) = ind(j) + allocate(tmparr, source=arr) + call qsort_index_DP(tmparr, ind) + + return + end subroutine util_sort_index_dp + + + recursive pure subroutine qsort_index_DP(arr, ind) + !! author: David A. Minton + !! + !! Sort input DP precision array by index in ascending numerical order using quicksort sort. + implicit none + real(DP), dimension(:), intent(inout) :: arr + integer(I4B),dimension(:),intent(out) :: ind + integer :: iq + + if (size(arr) > 1) then + call partition_DP(arr, iq, ind) + call qsort_index_DP(arr(:iq-1),ind(:iq-1)) + call qsort_index_DP(arr(iq:), ind(iq:)) + end if + + return + end subroutine qsort_index_DP + + + pure subroutine partition_DP(arr, marker, ind) + !! author: David A. Minton + !! + !! Partition function for quicksort on DP type + real(DP), intent(inout), dimension(:) :: arr + integer(I4B), intent(inout), dimension(:), optional :: ind + integer(I4B), intent(out) :: marker + integer(I4B) :: i, j, itmp + real(DP) :: temp + real(DP) :: x ! pivot point + + x = arr(1) + i = 0 + j = size(arr) + 1 + + do + j = j - 1 + do + if (arr(j) <= x) exit j = j - 1 end do - ind(j + 1) = itmp + i = i + 1 + do + if (arr(i) >= x) exit + i = i + 1 + end do + if (i < j) then + ! exchange A(i) and A(j) + temp = arr(i) + arr(i) = arr(j) + arr(j) = temp + itmp = ind(i) + ind(i) = ind(j) + ind(j) = itmp + else if (i == j) then + marker = i + 1 + return + else + marker = i + return + endif end do - + return - end subroutine util_sort_index_dp - + end subroutine Partition_index_DP + module pure subroutine util_sort_i4b(arr) !! author: David A. Minton From c37c73d756ea1c7c15c1106319e7bfc73ee3c4af Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 16:35:04 -0400 Subject: [PATCH 07/16] Fixed typo in subroutine name --- src/util/util_sort.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index 9cf89faee..a662eb52c 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -177,7 +177,7 @@ pure subroutine partition_DP(arr, marker, ind) end do return - end subroutine Partition_index_DP + end subroutine Partition_DP module pure subroutine util_sort_i4b(arr) From 13aa25533e43b92f0cf5d1bdce97f07414afa1a0 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 17:53:05 -0400 Subject: [PATCH 08/16] Improved performance of sweep phase --- src/encounter/encounter_check.f90 | 66 ++++++++++++++++++++++--------- src/util/util_sort.f90 | 31 ++++++++++----- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 93580eb33..9c5570508 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -971,6 +971,7 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr integer(I4B), dimension(:), allocatable :: ibeg, iend + type(walltimer) :: timer ntot = n1 + n2 call util_index_array(ind_arr, ntot) @@ -982,16 +983,28 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure + call timer%reset() + call timer%start() call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) - + call timer%stop() + write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step + + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) + call timer%stop() + write(*,*) "collapse rag: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() ! Reorder the pairs and sort the first index in order to remove any duplicates do concurrent(k = 1:nenc, index2(k) < index1(k)) i = index2(k) index2(k) = index1(k) index1(k) = i end do + call timer%stop() + write(*,*) "reorder arr : ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_sweep_aabb_double_list @@ -1058,10 +1071,9 @@ subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibeg, ien integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi ntot = n1 + n2 - !$omp parallel do default(private) schedule(guided)& + !$omp parallel do default(private) schedule(dynamic)& !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & - !$omp firstprivate(ntot, n1, n2) & - !$omp lastprivate(ibegi, iendi) + !$omp firstprivate(ntot, n1, n2) do i = 1, ntot ibegi(1) = ibeg(1,i) + 1 iendi(1) = iend(1,i) - 1 @@ -1096,10 +1108,10 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, in logical, dimension(n) :: lencounteri integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi - !$omp parallel do default(private) schedule(guided)& + !$omp parallel do default(private) schedule(dynamic)& !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & !$omp firstprivate(n) & - !$omp lastprivate(ibegi, iendi) + !$omp lastprivate(ibegi, iendi, lencounteri) do i = 1, n ibegi(1) = ibeg(1,i) + 1 iendi(1) = iend(1,i) - 1 @@ -1118,7 +1130,7 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, in end subroutine encounter_check_sweep_aabb_all_single_list - pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) + subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Encounters from the same lists not allowed (e.g. pl-tp encounters only) @@ -1133,14 +1145,20 @@ pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox, dim - logical, dimension(SWEEPDIM) :: lenc + integer(I4B) :: j, jbox, dim, jlo, jhi + logical, dimension(2:SWEEPDIM) :: lenc + integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. - lenc(1) = .true. - do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval - j = ext_ind(jbox) - if (j > ntot) j = j - ntot ! If this is an endpoint index, shift it to the correct range + jlo = ibegi(1) + jhi = iendi(1) + allocate(box(jlo:jhi)) + box(:) = ext_ind(jlo:jhi) + where(box(:) > ntot) + box(:) = box(:) - ntot + endwhere + do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + j = box(jbox) if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed ! Check the y-dimension do dim = 2, SWEEPDIM @@ -1165,14 +1183,24 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the x-dimension logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox, dim - logical, dimension(SWEEPDIM) :: lenc + integer(I4B) :: j, jbox, dim, jlo, jhi + logical, dimension(2:SWEEPDIM) :: lenc + integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. - lenc(1) = .true. - do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval - j = ext_ind(jbox) - if (j > n) j = j - n ! If this is an endpoint index, shift it to the correct range + + jlo = ibegi(1) + jhi = iendi(1) + + allocate(box(jlo:jhi)) + box(:) = ext_ind(jlo:jhi) + + where(box(:) > n) + box(:) = box(:) - n + endwhere + + do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + j = box(jbox) ! Check the other dimensions do dim = 2, SWEEPDIM lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index a662eb52c..1b09b84f7 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -118,9 +118,12 @@ recursive pure subroutine qsort_index_DP(arr, ind) !! author: David A. Minton !! !! Sort input DP precision array by index in ascending numerical order using quicksort sort. + !! implicit none + ! Arguments real(DP), dimension(:), intent(inout) :: arr integer(I4B),dimension(:),intent(out) :: ind + !! Internals integer :: iq if (size(arr) > 1) then @@ -137,16 +140,24 @@ pure subroutine partition_DP(arr, marker, ind) !! author: David A. Minton !! !! Partition function for quicksort on DP type - real(DP), intent(inout), dimension(:) :: arr + !! + implicit none + ! Arguments + real(DP), intent(inout), dimension(:) :: arr integer(I4B), intent(inout), dimension(:), optional :: ind - integer(I4B), intent(out) :: marker - integer(I4B) :: i, j, itmp + integer(I4B), intent(out) :: marker + ! Internals + integer(I4B) :: i, j, itmp, narr, ipiv real(DP) :: temp - real(DP) :: x ! pivot point + real(DP) :: x ! pivot point + + narr = size(arr) - x = arr(1) + ! Get center as pivot, as this is likely partially sorted + ipiv = narr / 2 + x = arr(ipiv) i = 0 - j = size(arr) + 1 + j = narr + 1 do j = j - 1 @@ -164,9 +175,11 @@ pure subroutine partition_DP(arr, marker, ind) temp = arr(i) arr(i) = arr(j) arr(j) = temp - itmp = ind(i) - ind(i) = ind(j) - ind(j) = itmp + if (present(ind)) then + itmp = ind(i) + ind(i) = ind(j) + ind(j) = itmp + end if else if (i == j) then marker = i + 1 return From 4032d6b2d667a7baa922f941a7b6b3fd2d0d1391 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 22:30:08 -0400 Subject: [PATCH 09/16] streamlined sweep step --- src/encounter/encounter_check.f90 | 25 +++++-------------------- src/modules/encounter_classes.f90 | 2 +- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 9c5570508..e178aa4f6 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -989,22 +989,14 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind call timer%stop() write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) - call timer%stop() - write(*,*) "collapse rag: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() ! Reorder the pairs and sort the first index in order to remove any duplicates do concurrent(k = 1:nenc, index2(k) < index1(k)) i = index2(k) index2(k) = index1(k) index1(k) = i end do - call timer%stop() - write(*,*) "reorder arr : ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_sweep_aabb_double_list @@ -1146,7 +1138,6 @@ subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals integer(I4B) :: j, jbox, dim, jlo, jhi - logical, dimension(2:SWEEPDIM) :: lenc integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. @@ -1157,14 +1148,12 @@ subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, where(box(:) > ntot) box(:) = box(:) - ntot endwhere - do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + + do concurrent (jbox = jlo:jhi) j = box(jbox) if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed - ! Check the y-dimension - do dim = 2, SWEEPDIM - lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) - end do - lencounteri(j) = all(lenc(:)) + ! Check the other dimensions + lencounteri(j) = all((iend(2:SWEEPDIM,j) > ibegi(2:SWEEPDIM)) .and. (ibeg(2:SWEEPDIM,j) < iendi(2:SWEEPDIM))) end do return @@ -1184,7 +1173,6 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals integer(I4B) :: j, jbox, dim, jlo, jhi - logical, dimension(2:SWEEPDIM) :: lenc integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. @@ -1202,10 +1190,7 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval j = box(jbox) ! Check the other dimensions - do dim = 2, SWEEPDIM - lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) - end do - lencounteri(j) = all(lenc(:)) + lencounteri(j) = all((iend(2:SWEEPDIM,j) > ibegi(2:SWEEPDIM)) .and. (ibeg(2:SWEEPDIM,j) < iendi(2:SWEEPDIM))) end do return diff --git a/src/modules/encounter_classes.f90 b/src/modules/encounter_classes.f90 index 4965f14f1..1c182619b 100644 --- a/src/modules/encounter_classes.f90 +++ b/src/modules/encounter_classes.f90 @@ -7,7 +7,7 @@ module encounter_classes implicit none public - integer(I4B), parameter :: SWEEPDIM = 2 + integer(I4B), parameter :: SWEEPDIM = 3 type :: encounter_list integer(I4B) :: nenc = 0 !! Total number of encounters From 1bd61ce897074a25a28e60df026a81858910adb9 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Thu, 4 Nov 2021 16:16:37 -0400 Subject: [PATCH 10/16] Removed -parallel flag, as it seems like it may actually be slowing the runs down --- Makefile.Defines | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.Defines b/Makefile.Defines index c2d0258d5..0da15157f 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -54,7 +54,7 @@ VTUNE_FLAGS = -g -O2 -qopt-report=5 -simd -shared-intel -qopenmp -debug inline-d IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin STRICTREAL = -fp-model=precise -prec-div -prec-sqrt -assume protect-parens SIMDVEC = -simd -xhost -align all -assume contiguous_assumed_shape -vecabi=cmdtarget -fp-model no-except -fma -PAR = -qopenmp -parallel +PAR = -qopenmp HEAPARR = -heap-arrays 4194304 OPTREPORT = -qopt-report=5 IPRODUCTION = -no-wrap-margin -O3 -qopt-prefetch=0 -sox $(PAR) $(SIMDVEC) #$(HEAPARR) From e59395e0e50f8bab5dbfec15b6d90d49da27e1a4 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 10:42:53 -0400 Subject: [PATCH 11/16] Simplified string handling in NetCDF writes --- src/netcdf/netcdf.f90 | 54 ++++++++++--------------------------------- 1 file changed, 12 insertions(+), 42 deletions(-) diff --git a/src/netcdf/netcdf.f90 b/src/netcdf/netcdf.f90 index 9ad4e38be..4ff68f74a 100644 --- a/src/netcdf/netcdf.f90 +++ b/src/netcdf/netcdf.f90 @@ -457,7 +457,6 @@ module function netcdf_read_frame_system(self, iu, param) result(ierr) integer(I4B) :: ierr !! Error code: returns 0 if the read is successful ! Internals integer(I4B) :: dim, i, j, tslot, idmax, npl_check, ntp_check - character(len=:), allocatable :: charstring real(DP), dimension(:), allocatable :: rtemp integer(I4B), dimension(:), allocatable :: itemp logical, dimension(:), allocatable :: validmask, tpmask, plmask @@ -690,10 +689,7 @@ module subroutine netcdf_read_particle_info_system(self, iu, param, plmask, tpma logical, dimension(:), intent(in) :: plmask !! Logical array indicating which index values belong to massive bodies logical, dimension(:), intent(in) :: tpmask !! Logical array indicating which index values belong to test particles ! Internals - integer(I4B) :: i, j, tslot, strlen, idslot, old_mode, idmax - character(len=:), allocatable :: charstring - character(len=NAMELEN) :: emptystr, lenstr - character(len=:), allocatable :: fmtlabel + integer(I4B) :: i, j, tslot, idslot, old_mode, idmax real(DP), dimension(:), allocatable :: rtemp real(DP), dimension(:,:), allocatable :: rtemp_arr integer(I4B), dimension(:), allocatable :: itemp @@ -701,10 +697,6 @@ module subroutine netcdf_read_particle_info_system(self, iu, param, plmask, tpma integer(I4B), dimension(:), allocatable :: plind, tpind ! This string of spaces of length NAMELEN is used to clear out any old data left behind inside the string variables - write(lenstr, *) NAMELEN - fmtlabel = "(A" // trim(adjustl(lenstr)) // ")" - write(emptystr, fmtlabel) " " - idmax = size(plmask) allocate(rtemp(idmax)) allocate(rtemp_arr(NDIM,idmax)) @@ -873,9 +865,8 @@ module subroutine netcdf_write_frame_base(self, iu, param) class(netcdf_parameters), intent(inout) :: iu !! Parameters used to identify a particular NetCDF dataset class(swiftest_parameters), intent(inout) :: param !! Current run configuration parameters ! Internals - integer(I4B) :: i, j, tslot, strlen, idslot, old_mode + integer(I4B) :: i, j, tslot, idslot, old_mode integer(I4B), dimension(:), allocatable :: ind - character(len=:), allocatable :: charstring call self%write_particle_info(iu, param) @@ -987,17 +978,12 @@ module subroutine netcdf_write_particle_info_base(self, iu, param) class(netcdf_parameters), intent(inout) :: iu !! Parameters used to identify a particular NetCDF dataset class(swiftest_parameters), intent(inout) :: param !! Current run configuration parameters ! Internals - integer(I4B) :: i, j, tslot, strlen, idslot, old_mode + integer(I4B) :: i, j, tslot, idslot, old_mode integer(I4B), dimension(:), allocatable :: ind - character(len=:), allocatable :: charstring - character(len=NAMELEN) :: emptystr, lenstr - character(len=:), allocatable :: fmtlabel + character(len=NAMELEN) :: charstring ! This string of spaces of length NAMELEN is used to clear out any old data left behind inside the string variables call check( nf90_set_fill(iu%ncid, nf90_nofill, old_mode) ) - write(lenstr, *) NAMELEN - fmtlabel = "(A" // trim(adjustl(lenstr)) // ")" - write(emptystr, fmtlabel) " " select type(self) class is (swiftest_body) @@ -1011,25 +997,17 @@ module subroutine netcdf_write_particle_info_base(self, iu, param) call check( nf90_put_var(iu%ncid, iu%id_varid, self%id(j), start=[idslot]) ) charstring = trim(adjustl(self%info(j)%name)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%name_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%name_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%name_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) charstring = trim(adjustl(self%info(j)%particle_type)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%ptype_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%ptype_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%ptype_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) charstring = trim(adjustl(self%info(j)%status)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%status_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%status_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%status_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) if (param%lclose) then charstring = trim(adjustl(self%info(j)%origin_type)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%origin_type_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%origin_type_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%origin_type_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) call check( nf90_put_var(iu%ncid, iu%origin_time_varid, self%info(j)%origin_time, start=[idslot]) ) call check( nf90_put_var(iu%ncid, iu%origin_xhx_varid, self%info(j)%origin_xh(1), start=[idslot]) ) call check( nf90_put_var(iu%ncid, iu%origin_xhy_varid, self%info(j)%origin_xh(2), start=[idslot]) ) @@ -1056,25 +1034,17 @@ module subroutine netcdf_write_particle_info_base(self, iu, param) call check( nf90_put_var(iu%ncid, iu%id_varid, self%id, start=[idslot]) ) charstring = trim(adjustl(self%info%name)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%name_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%name_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%name_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) charstring = trim(adjustl(self%info%particle_type)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%ptype_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%ptype_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%ptype_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) charstring = trim(adjustl(self%info%status)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%status_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%status_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%status_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) if (param%lclose) then charstring = trim(adjustl(self%info%origin_type)) - strlen = len(charstring) - call check( nf90_put_var(iu%ncid, iu%origin_type_varid, emptystr, start=[1, idslot], count=[NAMELEN, 1]) ) - call check( nf90_put_var(iu%ncid, iu%origin_type_varid, charstring, start=[1, idslot], count=[strlen, 1]) ) + call check( nf90_put_var(iu%ncid, iu%origin_type_varid, charstring, start=[1, idslot], count=[NAMELEN, 1]) ) call check( nf90_put_var(iu%ncid, iu%origin_time_varid, self%info%origin_time, start=[idslot]) ) call check( nf90_put_var(iu%ncid, iu%origin_xhx_varid, self%info%origin_xh(1), start=[idslot]) ) From e99c418d996f363be7bb0606c0417be3affbf0b9 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 10:50:54 -0400 Subject: [PATCH 12/16] Put back parallel flag --- Makefile.Defines | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.Defines b/Makefile.Defines index 0da15157f..e3f1696e5 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -54,7 +54,7 @@ VTUNE_FLAGS = -g -O2 -qopt-report=5 -simd -shared-intel -qopenmp -debug inline-d IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin STRICTREAL = -fp-model=precise -prec-div -prec-sqrt -assume protect-parens SIMDVEC = -simd -xhost -align all -assume contiguous_assumed_shape -vecabi=cmdtarget -fp-model no-except -fma -PAR = -qopenmp +PAR = -qopenmp -parallel HEAPARR = -heap-arrays 4194304 OPTREPORT = -qopt-report=5 IPRODUCTION = -no-wrap-margin -O3 -qopt-prefetch=0 -sox $(PAR) $(SIMDVEC) #$(HEAPARR) From 4f9584890f2ea75a65c30b9f85b6cc499718361a Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 16:57:34 -0400 Subject: [PATCH 13/16] Fixed memory leaks. The main one was due to opening already-open NetCDF files when writing and reading headers --- src/encounter/encounter_check.f90 | 45 ++----------------------------- src/main/swiftest_driver.f90 | 2 ++ src/netcdf/netcdf.f90 | 4 --- src/symba/symba_util.f90 | 9 ++++++- 4 files changed, 12 insertions(+), 48 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 2bddc0517..b125d0ff9 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -402,8 +402,6 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, npl_last = npl end if - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(x, v, renc, boundingbox) & !$omp firstprivate(dt, npl, n) @@ -419,31 +417,17 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, x(dim,1:npl) + renc(1:npl) + vshift_max(1:npl) * v(dim,1:npl) * dt]) end do !$omp end parallel do - call timer%stop() - write(*,*) "plpl sort : ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call boundingbox%sweep(npl, nenc, index1, index2) - call timer%stop() - write(*,*) "plpl sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Now that we have identified potential pairs, use the narrow-phase process to get the final values allocate(lencounter(nenc)) allocate(lvdotr(nenc)) - call timer%reset() - call timer%start() call encounter_check_all(nenc, index1, index2, x, v, x, v, renc, renc, dt, lencounter, lvdotr) - call timer%stop() - write(*,*) "plpl check : ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_reduce_broadphase(npl, nenc, index1, index2, lencounter, lvdotr) - call timer%stop() - write(*,*) "plpl reduce: ",timer%count_stop_step - timer%count_start_step deallocate(lencounter) end if @@ -520,14 +504,8 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt xplt(dim,1:nplt) + renct(1:nplt) + vpltshift_max(1:nplt) * vplt(dim,1:nplt) * dt]) end do !$omp end parallel do - call timer%stop() - write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step - - call timer%reset() - call timer%start() + call boundingbox%sweep(nplm, nplt, nenc, index1, index2) - call timer%stop() - write(*,*) "plplm sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Shift tiny body indices back into the range of the input position and velocity arrays @@ -537,15 +515,9 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt allocate(lencounter(nenc)) allocate(lvdotr(nenc)) - call timer%reset() - call timer%start() call encounter_check_all(nenc, index1, index2, xplm, vplm, xplt, vplt, rencm, renct, dt, lencounter, lvdotr) - call timer%stop() - write(*,*) "plplm check : ",timer%count_stop_step - timer%count_start_step call encounter_check_reduce_broadphase(ntot, nenc, index1, index2, lencounter, lvdotr) - call timer%stop() - write(*,*) "plplm reduce: ",timer%count_stop_step - timer%count_start_step end if return end subroutine encounter_check_all_sort_and_sweep_plplm @@ -706,8 +678,6 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call util_index_array(ind_arr, npl) - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(x, v, renc, lenc, ind_arr) & !$omp firstprivate(npl, dt) @@ -719,8 +689,6 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde renc(i), renc(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do - call timer%stop() - write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) @@ -758,8 +726,6 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call util_index_array(ind_arr, nplt) - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(xplm, vplm, xplt, vplt, rencm, renct, lenc, ind_arr) & !$omp firstprivate(nplm, nplt, dt) @@ -771,8 +737,6 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp rencm(i), renct(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do - call timer%stop() - write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) @@ -991,7 +955,6 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr integer(I4B), dimension(:), allocatable :: ibeg, iend - type(walltimer) :: timer ntot = n1 + n2 call util_index_array(ind_arr, ntot) @@ -1005,12 +968,8 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - call timer%reset() - call timer%start() call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) - call timer%stop() - write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step - + call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) ! Reorder the pairs and sort the first index in order to remove any duplicates diff --git a/src/main/swiftest_driver.f90 b/src/main/swiftest_driver.f90 index 5fdb43c8b..d4e1f2dc8 100644 --- a/src/main/swiftest_driver.f90 +++ b/src/main/swiftest_driver.f90 @@ -123,6 +123,8 @@ program swiftest_driver end do end associate + call nbody_system%dealloc() + call util_exit(SUCCESS) stop diff --git a/src/netcdf/netcdf.f90 b/src/netcdf/netcdf.f90 index 9ad4e38be..951707477 100644 --- a/src/netcdf/netcdf.f90 +++ b/src/netcdf/netcdf.f90 @@ -650,8 +650,6 @@ module subroutine netcdf_read_hdr_system(self, iu, param) tslot = int(param%ioutput, kind=I4B) + 1 - call check( nf90_open(param%outfile, NF90_NOWRITE, iu%ncid) ) - call check( nf90_get_var(iu%ncid, iu%time_varid, param%t, start=[tslot]) ) call check( nf90_get_var(iu%ncid, iu%npl_varid, self%pl%nbody, start=[tslot]) ) call check( nf90_get_var(iu%ncid, iu%ntp_varid, self%tp%nbody, start=[tslot]) ) @@ -1117,8 +1115,6 @@ module subroutine netcdf_write_hdr_system(self, iu, param) tslot = int(param%ioutput, kind=I4B) + 1 - call check( nf90_open(param%outfile, nf90_write, iu%ncid) ) - call check( nf90_put_var(iu%ncid, iu%time_varid, param%t, start=[tslot]) ) call check( nf90_put_var(iu%ncid, iu%npl_varid, self%pl%nbody, start=[tslot]) ) call check( nf90_put_var(iu%ncid, iu%ntp_varid, self%tp%nbody, start=[tslot]) ) diff --git a/src/symba/symba_util.f90 b/src/symba/symba_util.f90 index f429b74b7..50c3c4e4b 100644 --- a/src/symba/symba_util.f90 +++ b/src/symba/symba_util.f90 @@ -251,6 +251,8 @@ module subroutine symba_util_dealloc_pl(self) implicit none ! Arguments class(symba_pl), intent(inout) :: self !! SyMBA massive body object + ! Internals + integer(I4B) :: i if (allocated(self%lcollision)) deallocate(self%lcollision) if (allocated(self%lencounter)) deallocate(self%lencounter) @@ -262,6 +264,11 @@ module subroutine symba_util_dealloc_pl(self) if (allocated(self%isperi)) deallocate(self%isperi) if (allocated(self%peri)) deallocate(self%peri) if (allocated(self%atp)) deallocate(self%atp) + + do i = 1, self%nbody + call self%kin(i)%dealloc() + end do + if (allocated(self%kin)) deallocate(self%kin) call util_dealloc_pl(self) @@ -408,7 +415,7 @@ module subroutine symba_util_flatten_eucl_plpl(self, param) if (param%lflatten_interactions) then if (allocated(self%k_plpl)) deallocate(self%k_plpl) ! Reset the index array if it's been set previously allocate(self%k_plpl(2, nplpl), stat=err) - if (err /=0) then ! An error occurred trying to allocate this big array. This probably means it's too big to fit in memory, and so we will force the run back into triangular mode + if (err /= 0) then ! An error occurred trying to allocate this big array. This probably means it's too big to fit in memory, and so we will force the run back into triangular mode param%lflatten_interactions = .false. else do concurrent (i=1:npl, j=1:npl, j>i) From c35492eb10fbc0e8fdb769d0ea17029ee73e6017 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 17:13:34 -0400 Subject: [PATCH 14/16] Fixed deallocation error for kinship array --- src/symba/symba_util.f90 | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/symba/symba_util.f90 b/src/symba/symba_util.f90 index 50c3c4e4b..d38106705 100644 --- a/src/symba/symba_util.f90 +++ b/src/symba/symba_util.f90 @@ -265,11 +265,12 @@ module subroutine symba_util_dealloc_pl(self) if (allocated(self%peri)) deallocate(self%peri) if (allocated(self%atp)) deallocate(self%atp) - do i = 1, self%nbody - call self%kin(i)%dealloc() - end do - - if (allocated(self%kin)) deallocate(self%kin) + if (allocated(self%kin)) then + do i = 1, self%nbody + call self%kin(i)%dealloc() + end do + deallocate(self%kin) + end if call util_dealloc_pl(self) From 79a42f3684c6964405ce2ba6b842229dd2550ca9 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 17:31:05 -0400 Subject: [PATCH 15/16] Fixed memory leak caused by multiple opening of NetCDF file on a restart --- src/io/io.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/io/io.f90 b/src/io/io.f90 index 62e77e38b..ebdd68531 100644 --- a/src/io/io.f90 +++ b/src/io/io.f90 @@ -2151,7 +2151,6 @@ module subroutine io_write_frame_system(self, param) errmsg = param%outfile // " not found! You must specify OUT_STAT = NEW, REPLACE, or UNKNOWN" goto 667 end if - call param%nciu%open(param) case('NEW') if (fileExists) then errmsg = param%outfile // " Alread Exists! You must specify OUT_STAT = APPEND, REPLACE, or UNKNOWN" From 63684b0f3416125815dcb06503f0a1b0ddd9ee4b Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 22:41:00 -0400 Subject: [PATCH 16/16] Updated Makefiles on debug branch --- Makefile | 18 +++++++++--------- Makefile.Defines | 20 +++++++++++++++----- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index d1da4b19f..7ac420ee3 100644 --- a/Makefile +++ b/Makefile @@ -65,8 +65,8 @@ MODULES = $(SWIFTEST_MODULES) $(USER_MODULES) .PHONY : all mod fast strict drivers bin clean force % : %.f90 force - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) $< -o $@ \ - -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) $(LADVIXE) + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $< -o $@ \ + -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) $(INSTALL_PROGRAM) $@ $(SWIFTEST_HOME)/bin rm -f $@ @@ -79,7 +79,7 @@ all: mod: cd $(SWIFTEST_HOME)/src/modules/; \ - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c $(MODULES); \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c $(MODULES); \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o; \ $(INSTALL_DATA) *.mod *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.mod *.smod @@ -188,34 +188,34 @@ strict: ln -s $(SWIFTEST_HOME)/Makefile .; \ make strictdir cd $(SWIFTEST_HOME)/src/helio; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c helio_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c helio_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/symba; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c symba_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c symba_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/rmvs; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c rmvs_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c rmvs_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/whm; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c whm_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c whm_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod fastdir: - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod strictdir: - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smo diff --git a/Makefile.Defines b/Makefile.Defines index 4bb4e55a8..c943d55d5 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -45,12 +45,13 @@ COLLRESOLVE_HOME = $(ROOT_DIR)/collresolve/ # DO NOT include in FFLAGS the "-c" option to compile object only # this is done explicitly as needed in the Makefile -ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vec -shared-intel -ldl -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -traceback +ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vecabi=cmdtarget -simd -shared-intel -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -xhost -traceback #Be sure to set the environment variable KMP_FORKJOIN_FRAMES=1 for OpenMP debuging in vtune +IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin STRICTREAL = -fp-model=precise -prec-div -prec-sqrt -assume protect-parens SIMDVEC = -simd -xhost -align all -assume contiguous_assumed_shape -vecabi=cmdtarget -fp-model no-except -fma -PAR = -qopenmp -parallel +PAR = -qopenmp -parallel HEAPARR = -heap-arrays 4194304 OPTREPORT = -qopt-report=5 IPRODUCTION = -no-wrap-margin -O3 -qopt-prefetch=0 -sox $(PAR) $(SIMDVEC) #$(HEAPARR) @@ -62,17 +63,26 @@ GMEM = -fsanitize-address-use-after-scope -fstack-check -fsanitize=bounds-stri GWARNINGS = -Wall -Warray-bounds -Wimplicit-interface -Wextra -Warray-temporaries GPRODUCTION = -O3 -ffree-line-length-none $(GPAR) + MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul -FSTRICTFLAGS = $(IPRODUCTION) $(STRICTREAL) -FFLAGS = $(IPRODUCTION) -fp-model=fast +FSTRICTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) +FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) FORTRAN = ifort AR = xiar CC = icc -CC = icc +#FORTRAN = gfortran +#FFLAGS = $(GDEBUG) $(GMEM) #$(GPAR) +#FSTRICTFLAGS = $(GDEBUG) $(GMEM) #$(GPAR) +#AR = ar +#CC = cc + +# DO NOT include in CFLAGS the "-c" option to compile object only +# this is done explicitly as needed in the Makefile + CFLAGS = -O3 -w -m64 -std=c99 64_BIT_REALS = -r8