From 51cd620f459251e583119e03106100ec7ef9d9b5 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 24 Sep 2021 13:54:46 -0400 Subject: [PATCH 1/9] Created Makefiles tailored to work well for the Intel Advisor profiler --- Makefile | 20 +++++++++++--------- Makefile.Defines | 12 +++++------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 34df2b87a..4ee3be240 100644 --- a/Makefile +++ b/Makefile @@ -62,14 +62,16 @@ include Makefile.Defines MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul +IADVIXE = -I$(ADVISOR_2019_DIR)/include/ia64 +LADVIXE = -L$(ADVISOR_2019_DIR)/lib64 MODULES = $(SWIFTEST_MODULES) $(USER_MODULES) .PHONY : all mod fast strict drivers bin clean force % : %.f90 force - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $< -o $@ \ - -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) $< -o $@ \ + -L$(SWIFTEST_HOME)/lib -lswiftest -L$(NETCDF_FORTRAN_HOME)/lib -lnetcdf -lnetcdff $(LMKL) $(LADVIXE) $(INSTALL_PROGRAM) $@ $(SWIFTEST_HOME)/bin rm -f $@ @@ -82,7 +84,7 @@ all: mod: cd $(SWIFTEST_HOME)/src/modules/; \ - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c $(MODULES); \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c $(MODULES); \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o; \ $(INSTALL_DATA) *.mod *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.mod *.smod @@ -186,34 +188,34 @@ strict: ln -s $(SWIFTEST_HOME)/Makefile .; \ make strictdir cd $(SWIFTEST_HOME)/src/helio; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c helio_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c helio_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/symba; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c symba_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c symba_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/rmvs; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c rmvs_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c rmvs_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod cd $(SWIFTEST_HOME)/src/whm; \ - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c whm_kick.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c whm_kick.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod fastdir: - $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ + $(FORTRAN) $(FFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smod strictdir: - $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) -c *.f90; \ + $(FORTRAN) $(FSTRICTFLAGS) -I$(SWIFTEST_HOME)/include -I$(NETCDF_FORTRAN_HOME)/include $(IMKL) $(IADVIXE) -c *.f90; \ $(AR) rv $(SWIFTEST_HOME)/lib/libswiftest.a *.o *.smod; \ $(INSTALL_DATA) *.smod $(SWIFTEST_HOME)/include; \ rm -f *.o *.smo diff --git a/Makefile.Defines b/Makefile.Defines index 278388cb0..54fdbe3f7 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -45,10 +45,7 @@ COLLRESOLVE_HOME = $(ROOT_DIR)/collresolve/ # DO NOT include in FFLAGS the "-c" option to compile object only # this is done explicitly as needed in the Makefile -ADVIXE_DIR = /apps/cent7/intel/advisor_2019 -ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vecabi=cmdtarget -simd -shared-intel -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -xhost -traceback - -VTUNE_FLAGS = -g -O2 -qopt-report=5 -simd -shared-intel -qopenmp -debug inline-debug-info -parallel-source-info=2 -parallel -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -qopenmp -fp-model no-except -mp1 -xhost -traceback +ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vec -shared-intel -ldl -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -traceback #Be sure to set the environment variable KMP_FORKJOIN_FRAMES=1 for OpenMP debuging in vtune IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin @@ -69,14 +66,15 @@ GPRODUCTION = -O2 -ffree-line-length-none $(GPAR) #FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) #FFASTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) -FSTRICTFLAGS = $(IPRODUCTION) $(STRICTREAL) $(OPTREPORT) #$(ADVIXE_FLAGS) -FFLAGS = $(IPRODUCTION) -fp-model=fast $(OPTREPORT) #$(ADVIXE_FLAGS) +FSTRICTFLAGS = $(ADVIXE_FLAGS) $(STRICTREAL) $(SIMDVEC) $(PAR) +FFLAGS = $(ADVIXE_FLAGS) -fp-model=fast $(SIMDVEC) $(PAR) FORTRAN = ifort AR = xiar #FORTRAN = gfortran #FFLAGS = $(GDEBUG) # $(GMEM) $(GPAR) -#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only +#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall +#AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only # this is done explicitly as needed in the Makefile CC = icc From a703880e054c344c52c66b3af46f5c4f3e8b7ef2 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Wed, 20 Oct 2021 13:25:56 -0400 Subject: [PATCH 2/9] Streamlined compiler arguments --- Makefile.Defines | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/Makefile.Defines b/Makefile.Defines index 3aaa18138..a763c316f 100644 --- a/Makefile.Defines +++ b/Makefile.Defines @@ -48,39 +48,20 @@ COLLRESOLVE_HOME = $(ROOT_DIR)/collresolve/ ADVIXE_FLAGS = -g -O2 -qopt-report=5 -vec -shared-intel -ldl -debug inline-debug-info -DTBB_DEBUG -DTBB_USE_THREADING_TOOLS -traceback #Be sure to set the environment variable KMP_FORKJOIN_FRAMES=1 for OpenMP debuging in vtune -IDEBUG = -O0 -init=snan,arrays -nogen-interfaces -no-pie -no-ftz -fpe-all=0 -g -traceback -mp1 -fp-model strict -fpe0 -debug all -align all -pad -ip -prec-div -prec-sqrt -assume protect-parens -CB -no-wrap-margin STRICTREAL = -fp-model=precise -prec-div -prec-sqrt -assume protect-parens SIMDVEC = -simd -xhost -align all -assume contiguous_assumed_shape -vecabi=cmdtarget -fp-model no-except -fma PAR = -qopenmp -parallel -HEAPARR = -heap-arrays 4194304 -OPTREPORT = -qopt-report=5 -IPRODUCTION = -no-wrap-margin -O3 -qopt-prefetch=0 -sox $(PAR) $(SIMDVEC) #$(HEAPARR) - -#gfortran flags -GDEBUG = -g -Og -fbacktrace -fbounds-check -ffree-line-length-none -GPAR = -fopenmp -ftree-parallelize-loops=4 -GMEM = -fsanitize-address-use-after-scope -fstack-check -fsanitize=bounds-strict -fsanitize=undefined -fsanitize=signed-integer-overflow -fsanitize=object-size -fstack-protector-all -GWARNINGS = -Wall -Warray-bounds -Wimplicit-interface -Wextra -Warray-temporaries -GPRODUCTION = -O3 -ffree-line-length-none $(GPAR) MKL_ROOT = /apps/spack/bell/apps/intel-parallel-studio/cluster.2019.5-intel-19.0.5-4brgqlf/mkl/lib IMKL = -I$(MKLROOT)/include LMKL = -L$(MKLROOT)/lib/intel64 -qopt-matmul -#FFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) -#FFASTFLAGS = $(IDEBUG) #$(SIMDVEC) $(PAR) + FSTRICTFLAGS = $(ADVIXE_FLAGS) $(STRICTREAL) $(SIMDVEC) $(PAR) FFLAGS = $(ADVIXE_FLAGS) -fp-model=fast $(SIMDVEC) $(PAR) FORTRAN = ifort AR = xiar -#FORTRAN = gfortran -#FFLAGS = $(GDEBUG) # $(GMEM) $(GPAR) -#FFLAGS = $(GPRODUCTION) -g -fbacktrace #-fcheck=all #-Wall -#AR = ar # DO NOT include in CFLAGS the "-c" option to compile object only -# this is done explicitly as needed in the Makefile - -#CC = icc -CC = cc +CC = icc CFLAGS = -O3 -w -m64 -std=c99 64_BIT_REALS = -r8 From 5d9859d6ef8be7a7d8e0c0523a1594a7b2aaf95b Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 03:00:30 -0400 Subject: [PATCH 3/9] Added timing instrumentation to encounter checks --- src/encounter/encounter_check.f90 | 48 +++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index c64dbd68a..f32cd1db7 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -377,6 +377,8 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, npl_last = npl end if + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(x, v, renc, boundingbox) & !$omp firstprivate(dt, npl, n) @@ -392,17 +394,31 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, x(dim,1:npl) + renc(1:npl) + vshift_max(1:npl) * v(dim,1:npl) * dt]) end do !$omp end parallel do + call timer%stop() + write(*,*) "plpl sort : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call boundingbox%sweep(npl, nenc, index1, index2) + call timer%stop() + write(*,*) "plpl sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Now that we have identified potential pairs, use the narrow-phase process to get the final values allocate(lencounter(nenc)) allocate(lvdotr(nenc)) + call timer%reset() + call timer%start() call encounter_check_all(nenc, index1, index2, x, v, x, v, renc, renc, dt, lencounter, lvdotr) + call timer%stop() + write(*,*) "plpl check : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_reduce_broadphase(npl, nenc, index1, index2, lencounter, lvdotr) + call timer%stop() + write(*,*) "plpl reduce: ",timer%count_stop_step - timer%count_start_step deallocate(lencounter) end if @@ -447,9 +463,7 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt ntot = nplm + nplt n = 2 * ntot if (ntot /= ntot_last) then - call boundingbox%setup(ntot, ntot_last) - ntot_last = ntot end if @@ -479,8 +493,14 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt xplt(dim,1:nplt) + renct(1:nplt) + vpltshift_max(1:nplt) * vplt(dim,1:nplt) * dt]) end do !$omp end parallel do + call timer%stop() + write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call boundingbox%sweep(nplm, nplt, nenc, index1, index2) + call timer%stop() + write(*,*) "plplm sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Shift tiny body indices back into the range of the input position and velocity arrays @@ -490,12 +510,20 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt allocate(lencounter(nenc)) allocate(lvdotr(nenc)) + call timer%reset() + call timer%start() call encounter_check_all(nenc, index1, index2, xplm, vplm, xplt, vplt, rencm, renct, dt, lencounter, lvdotr) + call timer%stop() + write(*,*) "plplm check : ",timer%count_stop_step - timer%count_start_step ! Shift the tiny body indices back to their natural range index2(:) = index2(:) + nplm + call timer%reset() + call timer%start() call encounter_check_reduce_broadphase(ntot, nenc, index1, index2, lencounter, lvdotr) + call timer%stop() + write(*,*) "plplm reduce: ",timer%count_stop_step - timer%count_start_step end if return end subroutine encounter_check_all_sort_and_sweep_plplm @@ -656,6 +684,8 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call util_index_array(ind_arr, npl) + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(x, v, renc, lenc, ind_arr) & !$omp firstprivate(npl, dt) @@ -667,8 +697,14 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde renc(i), renc(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do + call timer%stop() + write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) + call timer%stop() + write(*,*) "plpl tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plpl @@ -704,6 +740,8 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call util_index_array(ind_arr, nplt) + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(xplm, vplm, xplt, vplt, rencm, renct, lenc, ind_arr) & !$omp firstprivate(nplm, nplt, dt) @@ -715,8 +753,14 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp rencm(i), renct(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do + call timer%stop() + write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) + call timer%stop() + write(*,*) "plplm tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plplm From 5ce265567e55dea659a01e4b179ca00c1f16cf9d Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 03:06:18 -0400 Subject: [PATCH 4/9] Added missing timer reset --- src/encounter/encounter_check.f90 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index f32cd1db7..c9602f995 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -467,6 +467,8 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt ntot_last = ntot end if + call timer%reset() + call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(xplm, xplt, vplm, vplt, rencm, renct, boundingbox) & !$omp firstprivate(dt, nplm, nplt, ntot) From ed2bdafdba8e0184c9e1cb5eeec9756850110ec2 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 16:34:21 -0400 Subject: [PATCH 5/9] Swapped insertion sort for quicksort for better performance in the sort/sweep algorithm --- src/encounter/encounter_check.f90 | 155 ++++++++++++++++-------------- src/util/util_sort.f90 | 85 +++++++++++++--- 2 files changed, 155 insertions(+), 85 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index c9602f995..93580eb33 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -497,7 +497,7 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt !$omp end parallel do call timer%stop() write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step - + call timer%reset() call timer%start() call boundingbox%sweep(nplm, nplt, nenc, index1, index2) @@ -702,11 +702,7 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call timer%stop() write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) - call timer%stop() - write(*,*) "plpl tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plpl @@ -758,11 +754,7 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call timer%stop() write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) - call timer%stop() - write(*,*) "plplm tricol: ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_all_triangular_plplm @@ -975,16 +967,22 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind integer(I4B), dimension(:), allocatable, intent(out) :: index1 !! List of indices for body 1 in each encounter candidate pair integer(I4B), dimension(:), allocatable, intent(out) :: index2 !! List of indices for body 2 in each encounter candidate pair !Internals - integer(I4B) :: i, k, ntot + integer(I4B) :: i, k, ntot, dim type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr + integer(I4B), dimension(:), allocatable :: ibeg, iend ntot = n1 + n2 call util_index_array(ind_arr, ntot) + allocate(ibeg(SWEEPDIM * ntot)) + allocate(iend(SWEEPDIM * ntot)) + do dim = 1, SWEEPDIM + ibeg((dim - 1) * ntot + 1:dim * ntot) = self%aabb(dim)%ibeg(:) + iend((dim - 1) * ntot + 1:dim * ntot) = self%aabb(dim)%iend(:) + end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - - call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), self%aabb(1)%ibeg(:), self%aabb(1)%iend(:), self%aabb(2)%ibeg(:), self%aabb(2)%iend(:), ind_arr(:), lenc(:)) + call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) @@ -1012,16 +1010,23 @@ module subroutine encounter_check_sweep_aabb_single_list(self, n, nenc, index1, integer(I4B), dimension(:), allocatable, intent(out) :: index1 !! List of indices for body 1 in each encounter candidate pair integer(I4B), dimension(:), allocatable, intent(out) :: index2 !! List of indices for body 2 in each encounter candidate pair !Internals - Integer(I4B) :: i, k + Integer(I4B) :: i, k, dim type(encounter_list), dimension(n) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr type(walltimer) :: timer + integer(I4B), dimension(:), allocatable :: ibeg, iend call util_index_array(ind_arr, n) + allocate(ibeg(SWEEPDIM * n)) + allocate(iend(SWEEPDIM * n)) + do dim = 1, SWEEPDIM + ibeg((dim - 1) * n + 1:dim * n) = self%aabb(dim)%ibeg(:) + iend((dim - 1) * n + 1:dim * n) = self%aabb(dim)%iend(:) + end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - call encounter_check_sweep_aabb_all_single_list(n, self%aabb(1)%ind(:), self%aabb(1)%ibeg(:), self%aabb(1)%iend(:), self%aabb(2)%ibeg(:), self%aabb(2)%iend(:), ind_arr(:), lenc(:)) + call encounter_check_sweep_aabb_all_single_list(n, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, n]), reshape(iend(:), [SWEEPDIM, n]), ind_arr(:), lenc(:)) call encounter_check_collapse_ragged_list(lenc, n, nenc, index1, index2) @@ -1036,34 +1041,34 @@ module subroutine encounter_check_sweep_aabb_single_list(self, n, nenc, index1, end subroutine encounter_check_sweep_aabb_single_list - subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) + subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibeg, iend, ind_arr, lenc) !! author: David A. Minton !! !! Performs the loop part of the sweep operation. Double list version (e.g. pl-tp or plm-plt) implicit none ! Arguments - integer(I4B), intent(in) :: n1, n2 !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes - type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body + integer(I4B), intent(in) :: n1, n2 !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions + integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes + type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body ! Internals integer(I4B) :: i, ntot logical, dimension(n1+n2) :: lencounteri - integer(I4B) :: ibegxi, iendxi, ibegyi, iendyi + integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi ntot = n1 + n2 !$omp parallel do default(private) schedule(guided)& - !$omp shared(ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) & - !$omp firstprivate(ntot, n1, n2) + !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & + !$omp firstprivate(ntot, n1, n2) & + !$omp lastprivate(ibegi, iendi) do i = 1, ntot - ibegxi = ibegx(i) + 1 - iendxi = iendx(i) - 1 - if (iendxi >= ibegxi) then - ibegyi = ibegy(i) - iendyi = iendy(i) - call encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind(:), ibegxi, iendxi, ibegyi, iendyi, ibegx(:), iendx(:), ibegy(:), iendy(:), lencounteri(:)) + ibegi(1) = ibeg(1,i) + 1 + iendi(1) = iend(1,i) - 1 + if (iendi(1) >= ibegi(1)) then + ibegi(2:SWEEPDIM) = ibeg(2:SWEEPDIM,i) + iendi(2:SWEEPDIM) = iend(2:SWEEPDIM,i) + call encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind(:), ibegi(:), iendi(:), ibeg(:,:), iend(:,:), lencounteri(:)) call encounter_check_make_ragged_list(lencounteri(:), ind_arr(:), lenc(i)%nenc, lenc(i)%index2) else lenc(i)%nenc = 0 @@ -1075,33 +1080,33 @@ subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibegx, ie end subroutine encounter_check_sweep_aabb_all_double_list - subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) + subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, ind_arr, lenc) !! author: David A. Minton !! !! Performs the loop part of the sweep operation. Single list version (e.g. pl-pl) implicit none ! Arguments - integer(I4B), intent(in) :: n !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes - type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body + integer(I4B), intent(in) :: n !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimension + integer(I4B), dimension(:), intent(in) :: ind_arr !! index array for mapping body 2 indexes + type(encounter_list), dimension(:), intent(inout) :: lenc !! Encounter list for the ith body ! Internals integer(I4B) :: i logical, dimension(n) :: lencounteri - integer(I4B) :: ibegxi, iendxi, ibegyi, iendyi + integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi !$omp parallel do default(private) schedule(guided)& - !$omp shared(ext_ind, ibegx, iendx, ibegy, iendy, ind_arr, lenc) & - !$omp firstprivate(n) + !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & + !$omp firstprivate(n) & + !$omp lastprivate(ibegi, iendi) do i = 1, n - ibegxi = ibegx(i) + 1 - iendxi = iendx(i) - 1 - if (iendxi >= ibegxi) then - ibegyi = ibegy(i) - iendyi = iendy(i) - call encounter_check_sweep_aabb_one_single_list(n, ext_ind(:), ibegxi, iendxi, ibegyi, iendyi, ibegx(:), iendx(:), ibegy(:), iendy(:), lencounteri(:)) + ibegi(1) = ibeg(1,i) + 1 + iendi(1) = iend(1,i) - 1 + if (iendi(1) >= ibegi(1)) then + ibegi(2:SWEEPDIM) = ibeg(2:SWEEPDIM,i) + iendi(2:SWEEPDIM) = iend(2:SWEEPDIM,i) + call encounter_check_sweep_aabb_one_single_list(n, ext_ind(:), ibegi(:), iendi(:), ibeg(:,:), iend(:,:), lencounteri(:)) call encounter_check_make_ragged_list(lencounteri(:), ind_arr(:), lenc(i)%nenc, lenc(i)%index2) else lenc(i)%nenc = 0 @@ -1113,60 +1118,66 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibegx, iendx, end subroutine encounter_check_sweep_aabb_all_single_list - pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegxi, iendxi, ibegyi, iendyi, ibegx, iendx, ibegy, iendy, lencounteri) + pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Encounters from the same lists not allowed (e.g. pl-tp encounters only) implicit none ! Arguments - integer(I4B), intent(in) :: i !! The current index of the ith body - integer(I4B), intent(in) :: n1 !! Number of bodies 1 - integer(I4B), intent(in) :: n2 !! Number of bodies 2 - integer(I4B), intent(in) :: ntot !! n1 + n2 - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), intent(in) :: ibegxi, iendxi !! The beginning and ending indices of the ith bounding box in the x-dimension - integer(I4B), intent(in) :: ibegyi, iendyi !! The beginning and ending indices of the ith bounding box in the y-dimension - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimensio - logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body + integer(I4B), intent(in) :: i !! The current index of the ith body + integer(I4B), intent(in) :: n1 !! Number of bodies 1 + integer(I4B), intent(in) :: n2 !! Number of bodies 2 + integer(I4B), intent(in) :: ntot !! n1 + n2 + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:), intent(in) :: ibegi, iendi !! The beginning and ending indices of the ith bounding box in the n-dimensions + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions + logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox + integer(I4B) :: j, jbox, dim + logical, dimension(SWEEPDIM) :: lenc lencounteri(:) = .false. - do concurrent(jbox = ibegxi:iendxi) ! Sweep forward until the end of the interval + lenc(1) = .true. + do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval j = ext_ind(jbox) if (j > ntot) j = j - ntot ! If this is an endpoint index, shift it to the correct range if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed ! Check the y-dimension - lencounteri(j) = (iendy(j) > ibegyi) .and. (ibegy(j) < iendyi) + do dim = 2, SWEEPDIM + lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) + end do + lencounteri(j) = all(lenc(:)) end do return end subroutine encounter_check_sweep_aabb_one_double_list - pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegxi, iendxi, ibegyi, iendyi, ibegx, iendx, ibegy, iendy, lencounteri) + pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Mutual encounters allowed (e.g. pl-pl) implicit none ! Arguments - integer(I4B), intent(in) :: n !! Number of bodies - integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents - integer(I4B), intent(in) :: ibegxi, iendxi !! The beginning and ending indices of the ith bounding box in the x-dimension - integer(I4B), intent(in) :: ibegyi, iendyi !! The beginning and ending indices of the ith bounding box in the y-dimension - integer(I4B), dimension(:), intent(in) :: ibegx, iendx !! Beginning and ending index lists in the x-dimension - integer(I4B), dimension(:), intent(in) :: ibegy, iendy !! Beginning and ending index lists in the y-dimension - logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body + integer(I4B), intent(in) :: n !! Number of bodies + integer(I4B), dimension(:), intent(in) :: ext_ind !! Sorted index array of extents + integer(I4B), dimension(:), intent(in) :: ibegi, iendi !! The beginning and ending indices of the ith bounding box in the n-dimensions + integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the x-dimension + logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox + integer(I4B) :: j, jbox, dim + logical, dimension(SWEEPDIM) :: lenc lencounteri(:) = .false. - do concurrent(jbox = ibegxi:iendxi) ! Sweep forward until the end of the interval + lenc(1) = .true. + do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval j = ext_ind(jbox) if (j > n) j = j - n ! If this is an endpoint index, shift it to the correct range - ! Check the y-dimension - lencounteri(j) = (iendy(j) > ibegyi) .and. (ibegy(j) < iendyi) + ! Check the other dimensions + do dim = 2, SWEEPDIM + lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) + end do + lencounteri(j) = all(lenc(:)) end do return diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index 8ea7a3026..9cf89faee 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -60,7 +60,7 @@ end subroutine util_sort_body module pure subroutine util_sort_dp(arr) !! author: David A. Minton !! - !! Sort input double precision array in place into ascending numerical order using insertion sort. + !! Sort input DP precision array in place into ascending numerical order using insertion sort. !! This algorithm works well for partially sorted arrays (which is usually the case here) !! implicit none @@ -89,7 +89,7 @@ end subroutine util_sort_dp module pure subroutine util_sort_index_dp(arr, ind) !! author: David A. Minton !! - !! Sort input double precision array by index in ascending numerical order using insertion sort. + !! Sort input DP precision array by index in ascending numerical order using insertion sort. !! This algorithm works well for partially sorted arrays (which is usually the case here). !! If ind is supplied already allocated, we assume it is an existing index array (e.g. a previously !! sorted array). If it is not allocated, this subroutine allocates it. @@ -99,27 +99,86 @@ module pure subroutine util_sort_index_dp(arr, ind) real(DP), dimension(:), intent(in) :: arr integer(I4B), dimension(:), allocatable, intent(inout) :: ind ! Internals - integer(I4B) :: n, i, j, jp, itmp + integer(I4B) :: n, i, j, itmp + real(DP), dimension(:), allocatable :: tmparr n = size(arr) if (.not.allocated(ind)) then allocate(ind(n)) ind = [(i, i=1, n)] end if - do i = 2, n - itmp = ind(i) - j = i - 1 - do while (j >= 1) - if (arr(ind(j)) <= arr(itmp)) exit - ind(j + 1) = ind(j) + allocate(tmparr, source=arr) + call qsort_index_DP(tmparr, ind) + + return + end subroutine util_sort_index_dp + + + recursive pure subroutine qsort_index_DP(arr, ind) + !! author: David A. Minton + !! + !! Sort input DP precision array by index in ascending numerical order using quicksort sort. + implicit none + real(DP), dimension(:), intent(inout) :: arr + integer(I4B),dimension(:),intent(out) :: ind + integer :: iq + + if (size(arr) > 1) then + call partition_DP(arr, iq, ind) + call qsort_index_DP(arr(:iq-1),ind(:iq-1)) + call qsort_index_DP(arr(iq:), ind(iq:)) + end if + + return + end subroutine qsort_index_DP + + + pure subroutine partition_DP(arr, marker, ind) + !! author: David A. Minton + !! + !! Partition function for quicksort on DP type + real(DP), intent(inout), dimension(:) :: arr + integer(I4B), intent(inout), dimension(:), optional :: ind + integer(I4B), intent(out) :: marker + integer(I4B) :: i, j, itmp + real(DP) :: temp + real(DP) :: x ! pivot point + + x = arr(1) + i = 0 + j = size(arr) + 1 + + do + j = j - 1 + do + if (arr(j) <= x) exit j = j - 1 end do - ind(j + 1) = itmp + i = i + 1 + do + if (arr(i) >= x) exit + i = i + 1 + end do + if (i < j) then + ! exchange A(i) and A(j) + temp = arr(i) + arr(i) = arr(j) + arr(j) = temp + itmp = ind(i) + ind(i) = ind(j) + ind(j) = itmp + else if (i == j) then + marker = i + 1 + return + else + marker = i + return + endif end do - + return - end subroutine util_sort_index_dp - + end subroutine Partition_index_DP + module pure subroutine util_sort_i4b(arr) !! author: David A. Minton From c37c73d756ea1c7c15c1106319e7bfc73ee3c4af Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 16:35:04 -0400 Subject: [PATCH 6/9] Fixed typo in subroutine name --- src/util/util_sort.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index 9cf89faee..a662eb52c 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -177,7 +177,7 @@ pure subroutine partition_DP(arr, marker, ind) end do return - end subroutine Partition_index_DP + end subroutine Partition_DP module pure subroutine util_sort_i4b(arr) From 13aa25533e43b92f0cf5d1bdce97f07414afa1a0 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 17:53:05 -0400 Subject: [PATCH 7/9] Improved performance of sweep phase --- src/encounter/encounter_check.f90 | 66 ++++++++++++++++++++++--------- src/util/util_sort.f90 | 31 ++++++++++----- 2 files changed, 69 insertions(+), 28 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 93580eb33..9c5570508 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -971,6 +971,7 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr integer(I4B), dimension(:), allocatable :: ibeg, iend + type(walltimer) :: timer ntot = n1 + n2 call util_index_array(ind_arr, ntot) @@ -982,16 +983,28 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure + call timer%reset() + call timer%start() call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) - + call timer%stop() + write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step + + call timer%reset() + call timer%start() call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) + call timer%stop() + write(*,*) "collapse rag: ",timer%count_stop_step - timer%count_start_step + call timer%reset() + call timer%start() ! Reorder the pairs and sort the first index in order to remove any duplicates do concurrent(k = 1:nenc, index2(k) < index1(k)) i = index2(k) index2(k) = index1(k) index1(k) = i end do + call timer%stop() + write(*,*) "reorder arr : ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_sweep_aabb_double_list @@ -1058,10 +1071,9 @@ subroutine encounter_check_sweep_aabb_all_double_list(n1, n2, ext_ind, ibeg, ien integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi ntot = n1 + n2 - !$omp parallel do default(private) schedule(guided)& + !$omp parallel do default(private) schedule(dynamic)& !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & - !$omp firstprivate(ntot, n1, n2) & - !$omp lastprivate(ibegi, iendi) + !$omp firstprivate(ntot, n1, n2) do i = 1, ntot ibegi(1) = ibeg(1,i) + 1 iendi(1) = iend(1,i) - 1 @@ -1096,10 +1108,10 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, in logical, dimension(n) :: lencounteri integer(I4B), dimension(SWEEPDIM) :: ibegi, iendi - !$omp parallel do default(private) schedule(guided)& + !$omp parallel do default(private) schedule(dynamic)& !$omp shared(ext_ind, ibeg, iend, ind_arr, lenc) & !$omp firstprivate(n) & - !$omp lastprivate(ibegi, iendi) + !$omp lastprivate(ibegi, iendi, lencounteri) do i = 1, n ibegi(1) = ibeg(1,i) + 1 iendi(1) = iend(1,i) - 1 @@ -1118,7 +1130,7 @@ subroutine encounter_check_sweep_aabb_all_single_list(n, ext_ind, ibeg, iend, in end subroutine encounter_check_sweep_aabb_all_single_list - pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) + subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, ibegi, iendi, ibeg, iend, lencounteri) !! author: David A. Minton !! !! Performs a sweep operation on a single body. Encounters from the same lists not allowed (e.g. pl-tp encounters only) @@ -1133,14 +1145,20 @@ pure subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the n-dimensions logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox, dim - logical, dimension(SWEEPDIM) :: lenc + integer(I4B) :: j, jbox, dim, jlo, jhi + logical, dimension(2:SWEEPDIM) :: lenc + integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. - lenc(1) = .true. - do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval - j = ext_ind(jbox) - if (j > ntot) j = j - ntot ! If this is an endpoint index, shift it to the correct range + jlo = ibegi(1) + jhi = iendi(1) + allocate(box(jlo:jhi)) + box(:) = ext_ind(jlo:jhi) + where(box(:) > ntot) + box(:) = box(:) - ntot + endwhere + do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + j = box(jbox) if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed ! Check the y-dimension do dim = 2, SWEEPDIM @@ -1165,14 +1183,24 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie integer(I4B), dimension(:,:), intent(in) :: ibeg, iend !! Beginning and ending index lists in the x-dimension logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals - integer(I4B) :: j, jbox, dim - logical, dimension(SWEEPDIM) :: lenc + integer(I4B) :: j, jbox, dim, jlo, jhi + logical, dimension(2:SWEEPDIM) :: lenc + integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. - lenc(1) = .true. - do concurrent(jbox = ibegi(1):iendi(1)) ! Sweep forward until the end of the interval - j = ext_ind(jbox) - if (j > n) j = j - n ! If this is an endpoint index, shift it to the correct range + + jlo = ibegi(1) + jhi = iendi(1) + + allocate(box(jlo:jhi)) + box(:) = ext_ind(jlo:jhi) + + where(box(:) > n) + box(:) = box(:) - n + endwhere + + do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + j = box(jbox) ! Check the other dimensions do dim = 2, SWEEPDIM lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) diff --git a/src/util/util_sort.f90 b/src/util/util_sort.f90 index a662eb52c..1b09b84f7 100644 --- a/src/util/util_sort.f90 +++ b/src/util/util_sort.f90 @@ -118,9 +118,12 @@ recursive pure subroutine qsort_index_DP(arr, ind) !! author: David A. Minton !! !! Sort input DP precision array by index in ascending numerical order using quicksort sort. + !! implicit none + ! Arguments real(DP), dimension(:), intent(inout) :: arr integer(I4B),dimension(:),intent(out) :: ind + !! Internals integer :: iq if (size(arr) > 1) then @@ -137,16 +140,24 @@ pure subroutine partition_DP(arr, marker, ind) !! author: David A. Minton !! !! Partition function for quicksort on DP type - real(DP), intent(inout), dimension(:) :: arr + !! + implicit none + ! Arguments + real(DP), intent(inout), dimension(:) :: arr integer(I4B), intent(inout), dimension(:), optional :: ind - integer(I4B), intent(out) :: marker - integer(I4B) :: i, j, itmp + integer(I4B), intent(out) :: marker + ! Internals + integer(I4B) :: i, j, itmp, narr, ipiv real(DP) :: temp - real(DP) :: x ! pivot point + real(DP) :: x ! pivot point + + narr = size(arr) - x = arr(1) + ! Get center as pivot, as this is likely partially sorted + ipiv = narr / 2 + x = arr(ipiv) i = 0 - j = size(arr) + 1 + j = narr + 1 do j = j - 1 @@ -164,9 +175,11 @@ pure subroutine partition_DP(arr, marker, ind) temp = arr(i) arr(i) = arr(j) arr(j) = temp - itmp = ind(i) - ind(i) = ind(j) - ind(j) = itmp + if (present(ind)) then + itmp = ind(i) + ind(i) = ind(j) + ind(j) = itmp + end if else if (i == j) then marker = i + 1 return From 4032d6b2d667a7baa922f941a7b6b3fd2d0d1391 Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 22 Oct 2021 22:30:08 -0400 Subject: [PATCH 8/9] streamlined sweep step --- src/encounter/encounter_check.f90 | 25 +++++-------------------- src/modules/encounter_classes.f90 | 2 +- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 9c5570508..e178aa4f6 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -989,22 +989,14 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind call timer%stop() write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) - call timer%stop() - write(*,*) "collapse rag: ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() ! Reorder the pairs and sort the first index in order to remove any duplicates do concurrent(k = 1:nenc, index2(k) < index1(k)) i = index2(k) index2(k) = index1(k) index1(k) = i end do - call timer%stop() - write(*,*) "reorder arr : ",timer%count_stop_step - timer%count_start_step return end subroutine encounter_check_sweep_aabb_double_list @@ -1146,7 +1138,6 @@ subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals integer(I4B) :: j, jbox, dim, jlo, jhi - logical, dimension(2:SWEEPDIM) :: lenc integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. @@ -1157,14 +1148,12 @@ subroutine encounter_check_sweep_aabb_one_double_list(i, n1, n2, ntot, ext_ind, where(box(:) > ntot) box(:) = box(:) - ntot endwhere - do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval + + do concurrent (jbox = jlo:jhi) j = box(jbox) if (((i <= n1) .and. (j <= n1)) .or. ((i > n1) .and. (j > n1))) cycle ! only pairs from the two different lists allowed - ! Check the y-dimension - do dim = 2, SWEEPDIM - lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) - end do - lencounteri(j) = all(lenc(:)) + ! Check the other dimensions + lencounteri(j) = all((iend(2:SWEEPDIM,j) > ibegi(2:SWEEPDIM)) .and. (ibeg(2:SWEEPDIM,j) < iendi(2:SWEEPDIM))) end do return @@ -1184,7 +1173,6 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie logical, dimension(:), intent(out) :: lencounteri !! Encounter list for the ith body ! Internals integer(I4B) :: j, jbox, dim, jlo, jhi - logical, dimension(2:SWEEPDIM) :: lenc integer(I4B), dimension(:), allocatable :: box lencounteri(:) = .false. @@ -1202,10 +1190,7 @@ pure subroutine encounter_check_sweep_aabb_one_single_list(n, ext_ind, ibegi, ie do concurrent(jbox = jlo:jhi) ! Sweep forward until the end of the interval j = box(jbox) ! Check the other dimensions - do dim = 2, SWEEPDIM - lenc(dim) = (iend(dim,j) > ibegi(dim)) .and. (ibeg(dim,j) < iendi(dim)) - end do - lencounteri(j) = all(lenc(:)) + lencounteri(j) = all((iend(2:SWEEPDIM,j) > ibegi(2:SWEEPDIM)) .and. (ibeg(2:SWEEPDIM,j) < iendi(2:SWEEPDIM))) end do return diff --git a/src/modules/encounter_classes.f90 b/src/modules/encounter_classes.f90 index 4965f14f1..1c182619b 100644 --- a/src/modules/encounter_classes.f90 +++ b/src/modules/encounter_classes.f90 @@ -7,7 +7,7 @@ module encounter_classes implicit none public - integer(I4B), parameter :: SWEEPDIM = 2 + integer(I4B), parameter :: SWEEPDIM = 3 type :: encounter_list integer(I4B) :: nenc = 0 !! Total number of encounters From 4f9584890f2ea75a65c30b9f85b6cc499718361a Mon Sep 17 00:00:00 2001 From: David A Minton Date: Fri, 5 Nov 2021 16:57:34 -0400 Subject: [PATCH 9/9] Fixed memory leaks. The main one was due to opening already-open NetCDF files when writing and reading headers --- src/encounter/encounter_check.f90 | 45 ++----------------------------- src/main/swiftest_driver.f90 | 2 ++ src/netcdf/netcdf.f90 | 4 --- src/symba/symba_util.f90 | 9 ++++++- 4 files changed, 12 insertions(+), 48 deletions(-) diff --git a/src/encounter/encounter_check.f90 b/src/encounter/encounter_check.f90 index 2bddc0517..b125d0ff9 100644 --- a/src/encounter/encounter_check.f90 +++ b/src/encounter/encounter_check.f90 @@ -402,8 +402,6 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, npl_last = npl end if - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static) & !$omp shared(x, v, renc, boundingbox) & !$omp firstprivate(dt, npl, n) @@ -419,31 +417,17 @@ subroutine encounter_check_all_sort_and_sweep_plpl(npl, x, v, renc, dt, lvdotr, x(dim,1:npl) + renc(1:npl) + vshift_max(1:npl) * v(dim,1:npl) * dt]) end do !$omp end parallel do - call timer%stop() - write(*,*) "plpl sort : ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call boundingbox%sweep(npl, nenc, index1, index2) - call timer%stop() - write(*,*) "plpl sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Now that we have identified potential pairs, use the narrow-phase process to get the final values allocate(lencounter(nenc)) allocate(lvdotr(nenc)) - call timer%reset() - call timer%start() call encounter_check_all(nenc, index1, index2, x, v, x, v, renc, renc, dt, lencounter, lvdotr) - call timer%stop() - write(*,*) "plpl check : ",timer%count_stop_step - timer%count_start_step - call timer%reset() - call timer%start() call encounter_check_reduce_broadphase(npl, nenc, index1, index2, lencounter, lvdotr) - call timer%stop() - write(*,*) "plpl reduce: ",timer%count_stop_step - timer%count_start_step deallocate(lencounter) end if @@ -520,14 +504,8 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt xplt(dim,1:nplt) + renct(1:nplt) + vpltshift_max(1:nplt) * vplt(dim,1:nplt) * dt]) end do !$omp end parallel do - call timer%stop() - write(*,*) "plplm sort : ",timer%count_stop_step - timer%count_start_step - - call timer%reset() - call timer%start() + call boundingbox%sweep(nplm, nplt, nenc, index1, index2) - call timer%stop() - write(*,*) "plplm sweep : ",timer%count_stop_step - timer%count_start_step if (nenc > 0) then ! Shift tiny body indices back into the range of the input position and velocity arrays @@ -537,15 +515,9 @@ subroutine encounter_check_all_sort_and_sweep_plplm(nplm, nplt, xplm, vplm, xplt allocate(lencounter(nenc)) allocate(lvdotr(nenc)) - call timer%reset() - call timer%start() call encounter_check_all(nenc, index1, index2, xplm, vplm, xplt, vplt, rencm, renct, dt, lencounter, lvdotr) - call timer%stop() - write(*,*) "plplm check : ",timer%count_stop_step - timer%count_start_step call encounter_check_reduce_broadphase(ntot, nenc, index1, index2, lencounter, lvdotr) - call timer%stop() - write(*,*) "plplm reduce: ",timer%count_stop_step - timer%count_start_step end if return end subroutine encounter_check_all_sort_and_sweep_plplm @@ -706,8 +678,6 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde call util_index_array(ind_arr, npl) - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(x, v, renc, lenc, ind_arr) & !$omp firstprivate(npl, dt) @@ -719,8 +689,6 @@ subroutine encounter_check_all_triangular_plpl(npl, x, v, renc, dt, lvdotr, inde renc(i), renc(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do - call timer%stop() - write(*,*) "plpl triang: ",timer%count_stop_step - timer%count_start_step call encounter_check_collapse_ragged_list(lenc, npl, nenc, index1, index2, lvdotr) @@ -758,8 +726,6 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp call util_index_array(ind_arr, nplt) - call timer%reset() - call timer%start() !$omp parallel do default(private) schedule(static)& !$omp shared(xplm, vplm, xplt, vplt, rencm, renct, lenc, ind_arr) & !$omp firstprivate(nplm, nplt, dt) @@ -771,8 +737,6 @@ subroutine encounter_check_all_triangular_plplm(nplm, nplt, xplm, vplm, xplt, vp rencm(i), renct(:), dt, ind_arr(:), lenc(i)) end do !$omp end parallel do - call timer%stop() - write(*,*) "plplm triang: ",timer%count_stop_step - timer%count_start_step call encounter_check_collapse_ragged_list(lenc, nplm, nenc, index1, index2, lvdotr) @@ -991,7 +955,6 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind type(encounter_list), dimension(n1+n2) :: lenc !! Array of encounter lists (one encounter list per body) integer(I4B), dimension(:), allocatable, save :: ind_arr integer(I4B), dimension(:), allocatable :: ibeg, iend - type(walltimer) :: timer ntot = n1 + n2 call util_index_array(ind_arr, ntot) @@ -1005,12 +968,8 @@ module subroutine encounter_check_sweep_aabb_double_list(self, n1, n2, nenc, ind end do ! Sweep the intervals for each of the massive bodies along one dimension ! This will build a ragged pair of index lists inside of the lenc data structure - call timer%reset() - call timer%start() call encounter_check_sweep_aabb_all_double_list(n1, n2, self%aabb(1)%ind(:), reshape(ibeg(:), [SWEEPDIM, ntot]), reshape(iend(:), [SWEEPDIM, ntot]), ind_arr(:), lenc(:)) - call timer%stop() - write(*,*) "sweep double: ",timer%count_stop_step - timer%count_start_step - + call encounter_check_collapse_ragged_list(lenc, ntot, nenc, index1, index2) ! Reorder the pairs and sort the first index in order to remove any duplicates diff --git a/src/main/swiftest_driver.f90 b/src/main/swiftest_driver.f90 index 5fdb43c8b..d4e1f2dc8 100644 --- a/src/main/swiftest_driver.f90 +++ b/src/main/swiftest_driver.f90 @@ -123,6 +123,8 @@ program swiftest_driver end do end associate + call nbody_system%dealloc() + call util_exit(SUCCESS) stop diff --git a/src/netcdf/netcdf.f90 b/src/netcdf/netcdf.f90 index 9ad4e38be..951707477 100644 --- a/src/netcdf/netcdf.f90 +++ b/src/netcdf/netcdf.f90 @@ -650,8 +650,6 @@ module subroutine netcdf_read_hdr_system(self, iu, param) tslot = int(param%ioutput, kind=I4B) + 1 - call check( nf90_open(param%outfile, NF90_NOWRITE, iu%ncid) ) - call check( nf90_get_var(iu%ncid, iu%time_varid, param%t, start=[tslot]) ) call check( nf90_get_var(iu%ncid, iu%npl_varid, self%pl%nbody, start=[tslot]) ) call check( nf90_get_var(iu%ncid, iu%ntp_varid, self%tp%nbody, start=[tslot]) ) @@ -1117,8 +1115,6 @@ module subroutine netcdf_write_hdr_system(self, iu, param) tslot = int(param%ioutput, kind=I4B) + 1 - call check( nf90_open(param%outfile, nf90_write, iu%ncid) ) - call check( nf90_put_var(iu%ncid, iu%time_varid, param%t, start=[tslot]) ) call check( nf90_put_var(iu%ncid, iu%npl_varid, self%pl%nbody, start=[tslot]) ) call check( nf90_put_var(iu%ncid, iu%ntp_varid, self%tp%nbody, start=[tslot]) ) diff --git a/src/symba/symba_util.f90 b/src/symba/symba_util.f90 index f429b74b7..50c3c4e4b 100644 --- a/src/symba/symba_util.f90 +++ b/src/symba/symba_util.f90 @@ -251,6 +251,8 @@ module subroutine symba_util_dealloc_pl(self) implicit none ! Arguments class(symba_pl), intent(inout) :: self !! SyMBA massive body object + ! Internals + integer(I4B) :: i if (allocated(self%lcollision)) deallocate(self%lcollision) if (allocated(self%lencounter)) deallocate(self%lencounter) @@ -262,6 +264,11 @@ module subroutine symba_util_dealloc_pl(self) if (allocated(self%isperi)) deallocate(self%isperi) if (allocated(self%peri)) deallocate(self%peri) if (allocated(self%atp)) deallocate(self%atp) + + do i = 1, self%nbody + call self%kin(i)%dealloc() + end do + if (allocated(self%kin)) deallocate(self%kin) call util_dealloc_pl(self) @@ -408,7 +415,7 @@ module subroutine symba_util_flatten_eucl_plpl(self, param) if (param%lflatten_interactions) then if (allocated(self%k_plpl)) deallocate(self%k_plpl) ! Reset the index array if it's been set previously allocate(self%k_plpl(2, nplpl), stat=err) - if (err /=0) then ! An error occurred trying to allocate this big array. This probably means it's too big to fit in memory, and so we will force the run back into triangular mode + if (err /= 0) then ! An error occurred trying to allocate this big array. This probably means it's too big to fit in memory, and so we will force the run back into triangular mode param%lflatten_interactions = .false. else do concurrent (i=1:npl, j=1:npl, j>i)