diff --git a/CMakeLists.txt b/CMakeLists.txt index e0b6f93..2da79b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ set (INCLUDE_HEADERS include/ActivationTable.h include/Common.h include/cuNSearch.h + Utils/cuda_helper.h ) set (HEADER_FILES diff --git a/Utils/cuda_helper.cpp b/Utils/cuda_helper.cpp new file mode 100644 index 0000000..2484fad --- /dev/null +++ b/Utils/cuda_helper.cpp @@ -0,0 +1,86 @@ +#include "cuda_helper.h" +#include + +CUDAException::CUDAException(const char *_const_Message) : std::runtime_error(_const_Message) +{ + +} + +CUDAMallocException::CUDAMallocException(const char *_const_Message) : std::runtime_error(_const_Message) +{ + +} + +CUDAMemCopyException::CUDAMemCopyException(const char *_const_Message) : std::runtime_error(_const_Message) +{ + +} + +void CudaHelper::DeviceSynchronize() +{ + cudaError_t cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) + { + auto temp = cudaGetErrorString(cudaStatus); + throw CUDAException(temp); + } +} + +void CudaHelper::GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads) +{ + numberOfThreads = (numberOfElements / alignment) * alignment; + numberOfThreadBlocks = (numberOfElements / alignment); + if (numberOfElements % alignment != 0) + { + numberOfThreads += alignment; + numberOfThreadBlocks++; + } +} + +void CudaHelper::MemcpyHostToDevice(void* host, void* device, size_t size) +{ + cudaError_t cudaStatus = cudaMemcpy(device, host, size, cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) + { + throw CUDAMemCopyException("cudaMemcpy() failed!"); + } +} + +void CudaHelper::MemcpyDeviceToHost(void* device, void* host, size_t size) +{ + cudaError_t cudaStatus = cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) + { + throw CUDAMemCopyException("cudaMemcpy() failed!"); + } +} + +void CudaHelper::CheckLastError() +{ + cudaError_t cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) + { + auto temp = cudaGetErrorString(cudaStatus); + throw CUDAException(temp); + } +} + +void CudaHelper::CudaMalloc(void** src, size_t size) +{ + cudaError_t cudaStatus = cudaMalloc(src, size); + if (cudaStatus != cudaSuccess) + { + printf("Error in CudaMalloc: %s : ", cudaGetErrorString(cudaStatus)); + throw cudaErrorMemoryAllocation; + } +} + +void CudaHelper::CudaFree(void* src) +{ + cudaError_t cudaStatus = cudaFree(src); + if (cudaStatus != cudaSuccess) + { + printf("Error in CudaFree: %s : ", cudaGetErrorString(cudaStatus)); + throw cudaErrorMemoryAllocation; + } +} \ No newline at end of file diff --git a/Utils/cuda_helper.cu b/Utils/cuda_helper.cu index ec60f2d..bdb9955 100644 --- a/Utils/cuda_helper.cu +++ b/Utils/cuda_helper.cu @@ -1,4 +1,4 @@ -#include "cuda_helper.h" +#include "cuda_helper.cuh" #include CUDAException::CUDAException(const char *_const_Message) : std::runtime_error(_const_Message) @@ -42,7 +42,7 @@ void CudaHelper::MemcpyHostToDevice(void* host, void* device, size_t size) cudaError_t cudaStatus = cudaMemcpy(device, host, size, cudaMemcpyHostToDevice); if (cudaStatus != cudaSuccess) { - throw CUDAMemCopyException("cudaMemcpy() failed!"); + throw CUDAMemCopyException("cudaMemcpy() from host to device failed!"); } } @@ -51,7 +51,7 @@ void CudaHelper::MemcpyDeviceToHost(void* device, void* host, size_t size) cudaError_t cudaStatus = cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost); if (cudaStatus != cudaSuccess) { - throw CUDAMemCopyException("cudaMemcpy() failed!"); + throw CUDAMemCopyException("cudaMemcpy() from device to host failed!"); } } @@ -63,4 +63,24 @@ void CudaHelper::CheckLastError() auto temp = cudaGetErrorString(cudaStatus); throw CUDAException(temp); } -} \ No newline at end of file +} + +void CudaHelper::CudaMalloc(void** src, size_t size) +{ + cudaError_t cudaStatus = cudaMalloc(src, size); + if (cudaStatus != cudaSuccess) + { + printf("Error: %s : ", cudaGetErrorString(cudaStatus)); + throw cudaErrorMemoryAllocation; + } +} + +void CudaHelper::CudaFree(void* src) +{ + cudaError_t cudaStatus = cudaFree(src); + if (cudaStatus != cudaSuccess) + { + printf("Error in CudaFree: %s : ", cudaGetErrorString(cudaStatus)); + throw cudaErrorMemoryAllocation; + } +} diff --git a/Utils/cuda_helper.cuh b/Utils/cuda_helper.cuh new file mode 100644 index 0000000..531292d --- /dev/null +++ b/Utils/cuda_helper.cuh @@ -0,0 +1,97 @@ +#pragma once +#include + +class CUDAException : public std::runtime_error +{ +public: + CUDAException(const char *_const_Message); +}; + +class CUDAMallocException : public std::runtime_error +{ +public: + CUDAMallocException(const char *_const_Message); +}; + +class CUDAMemCopyException : public std::runtime_error +{ +public: + CUDAMemCopyException(const char *const_Message); +}; + +/*static*/ class CudaHelper +{ +public: + /** Synchronizes the device work with the current thread and throws any errors as exception. + */ + static void DeviceSynchronize(); + + + /** Throws the last error as exception. + */ + static void CheckLastError(); + + static void GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads); + + + /** Gets a raw pointer from a thrust vector + */ + template + static T* GetPointer(thrust::device_vector &vector) + { + return thrust::raw_pointer_cast(&vector[0]); + } + + /** Gets the size of the device_vector data in bytes. + */ + template + static size_t GetSizeInBytes(const thrust::device_vector &vector) + { + return sizeof(T) * vector.size(); + } + + /** Copies data from host to device. + */ + static void MemcpyHostToDevice(void* host, void* device, size_t size); + + /** Copies data from host to device. + */ + template + static void MemcpyHostToDevice(T* host, T* device, size_t elements) + { + MemcpyHostToDevice((void*)host, (void*)device, elements * sizeof(T)); + } + + /** Copies data from device to host. + */ + static void MemcpyDeviceToHost(void* device, void* host, size_t size); + + /** Copies data from device to host. + */ + template + static void MemcpyDeviceToHost(T* device, T* host, size_t elements) + { + MemcpyDeviceToHost((void*)device, (void*)host, elements * sizeof(T)); + } + + static void CudaMalloc(void** src, size_t size); + + /** Reserve memory for data structures on device. + */ + template + static void CudaMalloc(T** src, size_t elements) + { + CudaMalloc((void**)src, elements * sizeof(T)); + } + + static void CudaFree(void* src); + + /** Reserve memory for data structures on device. + */ + template + static void CudaFree(T* src) + { + CudaFree((void*)src); + } + +}; \ No newline at end of file diff --git a/Utils/cuda_helper.h b/Utils/cuda_helper.h index 4a98572..531292d 100644 --- a/Utils/cuda_helper.h +++ b/Utils/cuda_helper.h @@ -74,4 +74,24 @@ class CUDAMemCopyException : public std::runtime_error MemcpyDeviceToHost((void*)device, (void*)host, elements * sizeof(T)); } + static void CudaMalloc(void** src, size_t size); + + /** Reserve memory for data structures on device. + */ + template + static void CudaMalloc(T** src, size_t elements) + { + CudaMalloc((void**)src, elements * sizeof(T)); + } + + static void CudaFree(void* src); + + /** Reserve memory for data structures on device. + */ + template + static void CudaFree(T* src) + { + CudaFree((void*)src); + } + }; \ No newline at end of file diff --git a/include/PointSet.h b/include/PointSet.h index 9d97a7f..11a438e 100644 --- a/include/PointSet.h +++ b/include/PointSet.h @@ -5,6 +5,7 @@ #include #include #include +#include "cuda_helper.h" #include "Common.h" @@ -35,6 +36,13 @@ class PointSet uint *Offsets; uint *Neighbors; + //#ifdef GPU_NEIGHBORHOOD_SEARCH + // Same data as in pinned memory. Used to avoid unnecessary data copies in the SPliSHSPlaSH-GPU implementation + uint *d_Counts; + uint *d_Offsets; + uint *d_Neighbors; + //#endif + NeighborSet() { NeighborCountAllocationSize = 0u; @@ -42,6 +50,17 @@ class PointSet Counts = nullptr; Offsets = nullptr; Neighbors = nullptr; + + d_Counts = nullptr; + d_Offsets = nullptr; + d_Neighbors = nullptr; + } + + ~NeighborSet() + { + CudaHelper::CudaFree(d_Counts); + CudaHelper::CudaFree(d_Offsets); + CudaHelper::CudaFree(d_Neighbors); } }; @@ -80,6 +99,31 @@ class PointSet return neighborSet.Neighbors[neighborSet.Offsets[i] + k]; } + inline uint n_neighborsets() + { + return neighbors.size(); + } + + inline uint* neighbor_indices(const uint i) + { + return neighbors[i].d_Neighbors; + } + + inline uint* neighbor_counts(const uint i) + { + return neighbors[i].d_Counts; + } + + inline uint* neighbor_offsets(const uint i) + { + return neighbors[i].d_Offsets; + } + + PointSetImplementation *getPointSetImplementation() + { + return impl.get(); + } + /** * Fetches pointer to neighbors of point i in the given point set. * @param point_set Point set index of other point set where neighbors have been searched. diff --git a/src/PointSetImplementation.h b/src/PointSetImplementation.h index c4d0792..db07778 100644 --- a/src/PointSetImplementation.h +++ b/src/PointSetImplementation.h @@ -31,6 +31,21 @@ namespace cuNSearch void copyToDevice(); + thrust::device_vector &getParticles() + { + return d_Particles; + } + + int getThreadsPerBlock() + { + return ThreadsPerBlock; + } + + uint getNumberOfBlocks() + { + return BlockStartsForParticles; + } + private: friend NeighborhoodSearch; friend cuNSearchDeviceData; @@ -59,4 +74,4 @@ namespace cuNSearch void prepareInternalDataStructures(GridInfo &gridInfo, size_t numberOfCells); }; -}; \ No newline at end of file +}; diff --git a/src/cuNSearchDeviceData.cu b/src/cuNSearchDeviceData.cu index 3e6bdc0..70e4ecb 100644 --- a/src/cuNSearchDeviceData.cu +++ b/src/cuNSearchDeviceData.cu @@ -19,7 +19,7 @@ #include "PointSetImplementation.h" #include "GridInfo.h" -#include "cuda_helper.h" +#include "cuda_helper.cuh" #include "cuNSearchKernels.cuh" namespace cuNSearch @@ -178,13 +178,17 @@ namespace cuNSearch if (queryPointSet.n_points() == 0) return; + auto &neighborSet = queryPointSet.neighbors[neighborListEntry]; auto queryPointSetImpl = queryPointSet.impl.get(); auto pointSetImpl = pointSet.impl.get(); uint particleCount = static_cast(queryPointSet.n_points()); USE_TIMING(Timing::startTiming("Execute kNeighborCount")); - d_NeighborCounts.resize(particleCount); + + uint* &d_NeighborCounts = neighborSet.d_Counts; + CudaHelper::CudaFree(d_NeighborCounts); + CudaHelper::CudaMalloc(&d_NeighborCounts, particleCount); kComputeCounts << BlockStartsForParticles, queryPointSetImpl->ThreadsPerBlock >> > ( (Real3*)CudaHelper::GetPointer(queryPointSetImpl->d_Particles), @@ -195,7 +199,7 @@ namespace cuNSearch CudaHelper::GetPointer(pointSetImpl->d_CellOffsets), CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), - CudaHelper::GetPointer(d_NeighborCounts), + d_NeighborCounts, CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices) ); @@ -205,23 +209,28 @@ namespace cuNSearch USE_TIMING(Timing::stopTiming(PRINT_STATS)); USE_TIMING(Timing::startTiming("Execute exclusive_scan over counts")); - d_NeighborWriteOffsets.resize(particleCount); + uint* &d_NeighborWriteOffsets = neighborSet.d_Offsets; + CudaHelper::CudaFree(d_NeighborWriteOffsets); + CudaHelper::CudaMalloc(&d_NeighborWriteOffsets, particleCount); //Prefix sum over neighbor counts thrust::exclusive_scan( - d_NeighborCounts.begin(), - d_NeighborCounts.end(), - d_NeighborWriteOffsets.begin()); + thrust::device_ptr(d_NeighborCounts), + thrust::device_ptr(d_NeighborCounts) + particleCount, + thrust::device_ptr(d_NeighborWriteOffsets)); CudaHelper::DeviceSynchronize(); //Compute total amount of neighbors uint lastOffset = 0; - CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets) + particleCount - 1, &lastOffset, 1); + CudaHelper::MemcpyDeviceToHost( d_NeighborWriteOffsets + particleCount - 1, &lastOffset, 1); uint lastParticleNeighborCount = 0; - CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts) + particleCount - 1, &lastParticleNeighborCount, 1); + CudaHelper::MemcpyDeviceToHost( d_NeighborCounts + particleCount - 1, &lastParticleNeighborCount, 1); uint totalNeighborCount = lastOffset + lastParticleNeighborCount; - d_Neighbors.resize(totalNeighborCount); + + uint* &d_Neighbors = neighborSet.d_Neighbors; + CudaHelper::CudaFree(d_Neighbors); + CudaHelper::CudaMalloc(&d_Neighbors, totalNeighborCount); CudaHelper::DeviceSynchronize(); @@ -237,8 +246,7 @@ namespace cuNSearch CudaHelper::GetPointer(pointSetImpl->d_CellOffsets), CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts), - CudaHelper::GetPointer(d_NeighborWriteOffsets), - CudaHelper::GetPointer(d_Neighbors), + d_NeighborWriteOffsets, d_Neighbors, CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices) ); @@ -249,8 +257,6 @@ namespace cuNSearch //Copy data to host USE_TIMING(Timing::startTiming("Neighbor copy from device to host - resize")); - auto &neighborSet = queryPointSet.neighbors[neighborListEntry]; - if (neighborSet.NeighborCountAllocationSize < totalNeighborCount) { if (neighborSet.NeighborCountAllocationSize != 0) @@ -285,9 +291,9 @@ namespace cuNSearch printf("Expected amount: %f MB \n", bytesToCopy / (1024.0f * 1024.0f)); } - CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_Neighbors), neighborSet.Neighbors, totalNeighborCount); - CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts), neighborSet.Counts, particleCount); - CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets), neighborSet.Offsets, particleCount); + CudaHelper::MemcpyDeviceToHost( d_Neighbors, neighborSet.Neighbors, totalNeighborCount); + CudaHelper::MemcpyDeviceToHost( d_NeighborCounts, neighborSet.Counts, particleCount); + CudaHelper::MemcpyDeviceToHost( d_NeighborWriteOffsets, neighborSet.Offsets, particleCount); USE_TIMING(Timing::stopTiming(PRINT_STATS)); } diff --git a/src/cuNSearchDeviceData.h b/src/cuNSearchDeviceData.h index e804971..868e58b 100644 --- a/src/cuNSearchDeviceData.h +++ b/src/cuNSearchDeviceData.h @@ -38,9 +38,9 @@ namespace cuNSearch thrust::device_vector d_MinMax; thrust::device_vector d_TempSortIndices; - //Device neighbor buffers (only temporary used: after the computation the data is copied to the host) - thrust::device_vector d_Neighbors; - thrust::device_vector d_NeighborCounts; - thrust::device_vector d_NeighborWriteOffsets; + // only temporary used. After neighborhood computation pointer is handed over to point set +/* uint* &d_Neighbors; + uint* &d_NeighborCounts; + uint* &d_NeighborWriteOffsets; */ }; }; \ No newline at end of file