! Module containing the kernel
module counting_kernel
    use cudafor
    implicit none

contains

    ! CUDA kernel: every launched thread increments `counter` atomically.
    !
    ! Two distinguished threads additionally report back to the host:
    !   * the thread whose thread and block indices are all 1 stores the
    !     launch configuration (blockDim / gridDim) and its own index sum;
    !   * the thread whose thread and block indices all equal their upper
    !     bounds (blockDim / gridDim) stores its own index sum.
    !
    ! Arguments (all live in managed memory, so the host can read them after
    ! synchronizing with the device):
    !   counter            number of threads that executed the kernel
    !   blockDims_         receives blockDim as seen on the device
    !   gridDims_          receives gridDim as seen on the device
    !   lowest_index_sum   index sum of the thread with the smallest indices
    !   highest_index_sum  index sum of the thread with the largest indices
    attributes(global) subroutine count_threads(counter, blockDims_, gridDims_, &
                                                lowest_index_sum, highest_index_sum)
        type(dim3), managed :: blockDims_
        type(dim3), managed :: gridDims_
        integer, managed :: counter, lowest_index_sum, highest_index_sum
        integer :: old_val
        integer :: index_sum

        ! Count the number of threads. A plain `counter = counter + 1` would be
        ! a data race between threads, hence the atomic update.
        old_val = atomicAdd(counter, 1)

        index_sum = threadIdx%x + threadIdx%y + threadIdx%z &
                    + blockIdx%x + blockIdx%y + blockIdx%z

        ! Only the thread with the lowest possible sum of thread and block
        ! indices executes this block. CUDA Fortran indices start at 1.
        if (threadIdx%x == 1 .and. threadIdx%y == 1 .and. threadIdx%z == 1 .and. &
            blockIdx%x == 1 .and. blockIdx%y == 1 .and. blockIdx%z == 1) then
            blockDims_ = blockDim
            gridDims_ = gridDim
            lowest_index_sum = index_sum
        end if

        ! Only the thread with the highest possible sum of thread and block
        ! indices executes this block.
        if (threadIdx%x == blockDim%x .and. threadIdx%y == blockDim%y .and. &
            threadIdx%z == blockDim%z .and. &
            blockIdx%x == gridDim%x .and. blockIdx%y == gridDim%y .and. &
            blockIdx%z == gridDim%z) then
            highest_index_sum = index_sum
        end if
    end subroutine count_threads

end module counting_kernel

! CPU code
program main
    use cudafor
    use counting_kernel
    implicit none

    ! Format of the summary line printed after a successful kernel run.
    character(len=*), parameter :: report_fmt = &
        '(A,I0,A,I0,A,I0,A,I0,A,A,I0,A,I0,A,I0,A,A,I0,A,I0,A,I0,A,I0,A)'

    ! Kernel launch configuration
    type(dim3) :: grid_definition, thread_block_definition
    integer :: istat
    integer :: n_blocks, threads_per_block
    integer, managed :: counter, lowest_index_sum, highest_index_sum
    ! Used to store the actual configuration when running
    type(dim3), managed :: blockDims_
    type(dim3), managed :: gridDims_

    counter = 0
    lowest_index_sum = -1
    highest_index_sum = -1

    ! Example configuration: 2 x 2 x 1 blocks of 4 x 4 x 2 threads each.
    grid_definition = dim3(2, 2, 1)
    thread_block_definition = dim3(4, 4, 2)

    call count_threads<<<grid_definition, thread_block_definition>>>( &
        counter, blockDims_, gridDims_, lowest_index_sum, highest_index_sum)

    ! Launch errors (e.g. an invalid configuration) are reported here.
    istat = cudaGetLastError()
    if (istat /= cudasuccess) then
        print *, cudaGetErrorString(istat)
    end if

    ! We need to wait until the kernel finishes before reading managed data
    ! on the host.
    istat = cudaDeviceSynchronize()
    if (istat /= cudasuccess) then
        print *, cudaGetErrorString(istat)
    end if

    if (counter == 0) then
        print *, "Kernel did not work correctly.\n"
    else
        n_blocks = gridDims_%x*gridDims_%y*gridDims_%z
        threads_per_block = blockDims_%x*blockDims_%y*blockDims_%z
        print report_fmt, "Program started ", counter, " = ", n_blocks, "x", &
            threads_per_block, " thread(s) grouped in ", n_blocks, " thread blocks ", &
            "(grid layout was x=", gridDims_%x, ",y=", gridDims_%y, ",z=", gridDims_%z, ") ", &
            "with ", threads_per_block, &
            " threads per block (block had dimensions x=", blockDims_%x, &
            ",y=", blockDims_%y, ",z=", blockDims_%z, ")."
    end if

    print '(A,I0)', "This information was written by thread with sum of indices equals ", &
        lowest_index_sum
    print '(A,I0)', "Thread with highest sum of indices has sum equals ", highest_index_sum

    ! exit
end program main