diff --git a/CMakeLists.txt b/CMakeLists.txt
index e0b6f93..2da79b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,6 +27,7 @@ set (INCLUDE_HEADERS
 	include/ActivationTable.h
 	include/Common.h
 	include/cuNSearch.h
+	Utils/cuda_helper.h
 )
 
 set (HEADER_FILES
diff --git a/Utils/cuda_helper.cpp b/Utils/cuda_helper.cpp
new file mode 100644
index 0000000..2484fad
--- /dev/null
+++ b/Utils/cuda_helper.cpp
@@ -0,0 +1,86 @@
+#include "cuda_helper.h"
+#include <cuda_runtime.h>
+
+CUDAException::CUDAException(const char *_const_Message) : std::runtime_error(_const_Message)
+{
+
+}
+
+CUDAMallocException::CUDAMallocException(const char *_const_Message) : std::runtime_error(_const_Message)
+{
+
+}
+
+CUDAMemCopyException::CUDAMemCopyException(const char *_const_Message) : std::runtime_error(_const_Message)
+{
+
+}
+
+void CudaHelper::DeviceSynchronize()
+{
+	cudaError_t cudaStatus = cudaDeviceSynchronize();
+	if (cudaStatus != cudaSuccess)
+	{
+		auto temp = cudaGetErrorString(cudaStatus);
+		throw CUDAException(temp);
+	}
+}
+
+void CudaHelper::GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads)
+{
+	numberOfThreads = (numberOfElements / alignment) * alignment;
+	numberOfThreadBlocks = (numberOfElements / alignment);
+	if (numberOfElements % alignment != 0)
+	{
+		numberOfThreads += alignment;
+		numberOfThreadBlocks++;
+	}
+}
+
+void CudaHelper::MemcpyHostToDevice(void* host, void* device, size_t size)
+{
+	cudaError_t cudaStatus = cudaMemcpy(device, host, size, cudaMemcpyHostToDevice);
+	if (cudaStatus != cudaSuccess)
+	{
+		throw CUDAMemCopyException("cudaMemcpy() failed!");
+	}
+}
+
+void CudaHelper::MemcpyDeviceToHost(void* device, void* host, size_t size)
+{
+	cudaError_t cudaStatus = cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost);
+	if (cudaStatus != cudaSuccess)
+	{
+		throw CUDAMemCopyException("cudaMemcpy() failed!");
+	}
+}
+
+void CudaHelper::CheckLastError()
+{
+	cudaError_t cudaStatus = cudaGetLastError();
+	if (cudaStatus != cudaSuccess)
+	{
+		auto temp = cudaGetErrorString(cudaStatus);
+		throw CUDAException(temp);
+	}
+}
+
+void CudaHelper::CudaMalloc(void** src, size_t size)
+{
+	cudaError_t cudaStatus = cudaMalloc(src, size);
+	if (cudaStatus != cudaSuccess)
+	{
+		printf("Error in CudaMalloc: %s : ", cudaGetErrorString(cudaStatus));
+		throw cudaErrorMemoryAllocation;
+	}
+}
+
+void CudaHelper::CudaFree(void* src)
+{
+	cudaError_t cudaStatus = cudaFree(src);
+	if (cudaStatus != cudaSuccess)
+	{
+		printf("Error in CudaFree: %s : ", cudaGetErrorString(cudaStatus));
+		throw cudaErrorMemoryAllocation;
+	}
+}
\ No newline at end of file
diff --git a/Utils/cuda_helper.cu b/Utils/cuda_helper.cu
index ec60f2d..bdb9955 100644
--- a/Utils/cuda_helper.cu
+++ b/Utils/cuda_helper.cu
@@ -1,4 +1,4 @@
-#include "cuda_helper.h"
+#include "cuda_helper.cuh"
 #include <cuda_runtime.h>
 
 CUDAException::CUDAException(const char *_const_Message) : std::runtime_error(_const_Message)
@@ -42,7 +42,7 @@ void CudaHelper::MemcpyHostToDevice(void* host, void* device, size_t size)
 	cudaError_t cudaStatus = cudaMemcpy(device, host, size, cudaMemcpyHostToDevice);
 	if (cudaStatus != cudaSuccess)
 	{
-		throw CUDAMemCopyException("cudaMemcpy() failed!");
+		throw CUDAMemCopyException("cudaMemcpy() from host to device failed!");
 	}
 }
 
@@ -51,7 +51,7 @@ void CudaHelper::MemcpyDeviceToHost(void* device, void* host, size_t size)
 	cudaError_t cudaStatus = cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost);
 	if (cudaStatus != cudaSuccess)
 	{
-		throw CUDAMemCopyException("cudaMemcpy() failed!");
+		throw CUDAMemCopyException("cudaMemcpy() from device to host failed!");
 	}
 }
 
@@ -63,4 +63,24 @@ void CudaHelper::CheckLastError()
 		auto temp = cudaGetErrorString(cudaStatus);
 		throw CUDAException(temp);
 	}
-}
\ No newline at end of file
+}
+
+void CudaHelper::CudaMalloc(void** src, size_t size)
+{
+	cudaError_t cudaStatus = cudaMalloc(src, size);
+	if (cudaStatus != cudaSuccess)
+	{
+		printf("Error: %s : ", cudaGetErrorString(cudaStatus));
+		throw cudaErrorMemoryAllocation;
+	}
+}
+
+void CudaHelper::CudaFree(void* src)
+{
+	cudaError_t cudaStatus = cudaFree(src);
+	if (cudaStatus != cudaSuccess)
+	{
+		printf("Error in CudaFree: %s : ", cudaGetErrorString(cudaStatus));
+		throw cudaErrorMemoryAllocation;
+	}
+}
diff --git a/Utils/cuda_helper.cuh b/Utils/cuda_helper.cuh
new file mode 100644
index 0000000..531292d
--- /dev/null
+++ b/Utils/cuda_helper.cuh
@@ -0,0 +1,97 @@
+#pragma once
+#include <thrust/device_vector.h>
+
+class CUDAException : public std::runtime_error
+{
+public:
+	CUDAException(const char *_const_Message);
+};
+
+class CUDAMallocException : public std::runtime_error
+{
+public:
+	CUDAMallocException(const char *_const_Message);
+};
+
+class CUDAMemCopyException : public std::runtime_error
+{
+public:
+	CUDAMemCopyException(const char *const_Message);
+};
+
+/*static*/ class CudaHelper
+{
+public:
+	/** Synchronizes the device work with the current thread and throws any errors as exception.
+	*/
+	static void DeviceSynchronize();
+
+
+	/** Throws the last error as exception.
+	*/
+	static void CheckLastError();
+
+	static void GetThreadBlocks(unsigned int numberOfElements, unsigned int alignment, /*out*/ unsigned int &numberOfThreadBlocks, /*out*/ unsigned int &numberOfThreads);
+
+
+	/** Gets a raw pointer from a thrust vector
+	*/
+	template<typename T>
+	static T* GetPointer(thrust::device_vector<T> &vector)
+	{
+		return thrust::raw_pointer_cast(&vector[0]);
+	}
+
+	/** Gets the size of the device_vector data in bytes.
+	*/
+	template<typename T>
+	static size_t GetSizeInBytes(const thrust::device_vector<T> &vector)
+	{
+		return sizeof(T) * vector.size();
+	}
+
+	/** Copies data from host to device.
+	*/
+	static void MemcpyHostToDevice(void* host, void* device, size_t size);
+
+	/** Copies data from host to device.
+	*/
+	template<typename T>
+	static void MemcpyHostToDevice(T* host, T* device, size_t elements)
+	{
+		MemcpyHostToDevice((void*)host, (void*)device, elements * sizeof(T));
+	}
+
+	/** Copies data from device to host.
+	*/
+	static void MemcpyDeviceToHost(void* device, void* host, size_t size);
+
+	/** Copies data from device to host.
+	*/
+	template<typename T>
+	static void MemcpyDeviceToHost(T* device, T* host, size_t elements)
+	{
+		MemcpyDeviceToHost((void*)device, (void*)host, elements * sizeof(T));
+	}
+
+	static void CudaMalloc(void** src, size_t size);
+
+	/** Reserve memory for data structures on device.
+	 */ 
+	template<typename T>
+	static void CudaMalloc(T** src, size_t elements)
+	{
+		CudaMalloc((void**)src, elements * sizeof(T));
+	}
+
+	static void CudaFree(void* src);
+
+	/** Reserve memory for data structures on device.
+	 */ 
+	template<typename T>
+	static void CudaFree(T* src)
+	{
+		CudaFree((void*)src);
+	}
+
+};
\ No newline at end of file
diff --git a/Utils/cuda_helper.h b/Utils/cuda_helper.h
index 4a98572..531292d 100644
--- a/Utils/cuda_helper.h
+++ b/Utils/cuda_helper.h
@@ -74,4 +74,24 @@ class CUDAMemCopyException : public std::runtime_error
 		MemcpyDeviceToHost((void*)device, (void*)host, elements * sizeof(T));
 	}
 
+	static void CudaMalloc(void** src, size_t size);
+
+	/** Reserve memory for data structures on device.
+	 */ 
+	template<typename T>
+	static void CudaMalloc(T** src, size_t elements)
+	{
+		CudaMalloc((void**)src, elements * sizeof(T));
+	}
+
+	static void CudaFree(void* src);
+
+	/** Reserve memory for data structures on device.
+	 */ 
+	template<typename T>
+	static void CudaFree(T* src)
+	{
+		CudaFree((void*)src);
+	}
+
 };
\ No newline at end of file
diff --git a/include/PointSet.h b/include/PointSet.h
index 9d97a7f..11a438e 100644
--- a/include/PointSet.h
+++ b/include/PointSet.h
@@ -5,6 +5,7 @@
 #include <iostream>
 #include <algorithm>
 #include <memory>
+#include "cuda_helper.h"
 
 #include "Common.h"
 
@@ -35,6 +36,13 @@ class PointSet
 		uint *Offsets;
 		uint *Neighbors;
 
+		//#ifdef GPU_NEIGHBORHOOD_SEARCH
+		// Same data as in pinned memory. Used to avoid unnecessary data copies in the SPliSHSPlaSH-GPU implementation
+		uint *d_Counts;
+		uint *d_Offsets;
+		uint *d_Neighbors;
+		//#endif
+
 		NeighborSet()
 		{
 			NeighborCountAllocationSize = 0u;
@@ -42,6 +50,17 @@ class PointSet
 			Counts = nullptr;
 			Offsets = nullptr;
 			Neighbors = nullptr;
+
+			d_Counts = nullptr;
+			d_Offsets = nullptr;
+			d_Neighbors = nullptr;
+		}
+
+		~NeighborSet()
+		{
+			CudaHelper::CudaFree(d_Counts);
+			CudaHelper::CudaFree(d_Offsets);
+			CudaHelper::CudaFree(d_Neighbors);
 		}
 	};
 
@@ -80,6 +99,31 @@ class PointSet
 		return neighborSet.Neighbors[neighborSet.Offsets[i] + k];
 	}
 
+	inline uint n_neighborsets()
+	{
+		return neighbors.size();
+	}
+
+	inline uint* neighbor_indices(const uint i)
+	{
+		return neighbors[i].d_Neighbors;
+	}
+
+	inline uint* neighbor_counts(const uint i)
+	{
+		return neighbors[i].d_Counts;
+	}
+
+	inline uint* neighbor_offsets(const uint i)
+	{
+		return neighbors[i].d_Offsets;
+	}
+
+	PointSetImplementation *getPointSetImplementation()
+	{
+		return impl.get();
+	}
+
 	/**
 	* Fetches pointer to neighbors of point i in the given point set.
 	* @param point_set Point set index of other point set where neighbors have been searched.
diff --git a/src/PointSetImplementation.h b/src/PointSetImplementation.h
index c4d0792..db07778 100644
--- a/src/PointSetImplementation.h
+++ b/src/PointSetImplementation.h
@@ -31,6 +31,21 @@ namespace cuNSearch
 
 		void copyToDevice();
 
+		thrust::device_vector<Real3> &getParticles()
+		{
+			return d_Particles;
+		}
+
+		int getThreadsPerBlock()
+		{
+			return ThreadsPerBlock;
+		}
+
+		uint getNumberOfBlocks()
+		{
+			return BlockStartsForParticles;
+		}
+
 	private:
 		friend NeighborhoodSearch;
 		friend cuNSearchDeviceData;
@@ -59,4 +74,4 @@ namespace cuNSearch
 
 		void prepareInternalDataStructures(GridInfo &gridInfo, size_t numberOfCells);
 	};
-};
\ No newline at end of file
+};
diff --git a/src/cuNSearchDeviceData.cu b/src/cuNSearchDeviceData.cu
index 3e6bdc0..70e4ecb 100644
--- a/src/cuNSearchDeviceData.cu
+++ b/src/cuNSearchDeviceData.cu
@@ -19,7 +19,7 @@
 
 #include "PointSetImplementation.h"
 #include "GridInfo.h"
-#include "cuda_helper.h"
+#include "cuda_helper.cuh"
 #include "cuNSearchKernels.cuh"
 
 namespace cuNSearch
@@ -178,13 +178,17 @@ namespace cuNSearch
 		if (queryPointSet.n_points() == 0)
 			return;
 	
+		auto &neighborSet = queryPointSet.neighbors[neighborListEntry];
 		auto queryPointSetImpl = queryPointSet.impl.get();
 		auto pointSetImpl = pointSet.impl.get();
 
 		uint particleCount = static_cast<uint>(queryPointSet.n_points());
 
 		USE_TIMING(Timing::startTiming("Execute kNeighborCount"));
-		d_NeighborCounts.resize(particleCount);
+
+		uint* &d_NeighborCounts = neighborSet.d_Counts;
+		CudaHelper::CudaFree(d_NeighborCounts);
+		CudaHelper::CudaMalloc(&d_NeighborCounts, particleCount);
 
 		kComputeCounts << <queryPointSetImpl->BlockStartsForParticles, queryPointSetImpl->ThreadsPerBlock >> > (
 			(Real3*)CudaHelper::GetPointer(queryPointSetImpl->d_Particles),
@@ -195,7 +199,7 @@ namespace cuNSearch
 			CudaHelper::GetPointer(pointSetImpl->d_CellOffsets),
 			CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts),
 
-			CudaHelper::GetPointer(d_NeighborCounts),
+			d_NeighborCounts,
 			CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices)
 			);
 
@@ -205,23 +209,28 @@ namespace cuNSearch
 		USE_TIMING(Timing::stopTiming(PRINT_STATS));
 		USE_TIMING(Timing::startTiming("Execute exclusive_scan over counts"));
 
-		d_NeighborWriteOffsets.resize(particleCount);
+		uint* &d_NeighborWriteOffsets = neighborSet.d_Offsets;
+		CudaHelper::CudaFree(d_NeighborWriteOffsets);
+		CudaHelper::CudaMalloc(&d_NeighborWriteOffsets, particleCount);
 
 		//Prefix sum over neighbor counts
 		thrust::exclusive_scan(
-			d_NeighborCounts.begin(),
-			d_NeighborCounts.end(),
-			d_NeighborWriteOffsets.begin());
+			thrust::device_ptr<uint>(d_NeighborCounts),
+			thrust::device_ptr<uint>(d_NeighborCounts) + particleCount,
+			thrust::device_ptr<uint>(d_NeighborWriteOffsets));
 
 		CudaHelper::DeviceSynchronize();
 
 		//Compute total amount of neighbors
 		uint lastOffset = 0;
-		CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets) + particleCount - 1, &lastOffset, 1);
+		CudaHelper::MemcpyDeviceToHost( d_NeighborWriteOffsets + particleCount - 1, &lastOffset, 1);
 		uint lastParticleNeighborCount = 0;
-		CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts) + particleCount - 1, &lastParticleNeighborCount, 1);
+		CudaHelper::MemcpyDeviceToHost( d_NeighborCounts + particleCount - 1, &lastParticleNeighborCount, 1);
 		uint totalNeighborCount = lastOffset + lastParticleNeighborCount;
-		d_Neighbors.resize(totalNeighborCount);
+
+		uint* &d_Neighbors = neighborSet.d_Neighbors;
+		CudaHelper::CudaFree(d_Neighbors);
+		CudaHelper::CudaMalloc(&d_Neighbors, totalNeighborCount);
 
 		CudaHelper::DeviceSynchronize();
 
@@ -237,8 +246,7 @@ namespace cuNSearch
 			CudaHelper::GetPointer(pointSetImpl->d_CellOffsets),
 			CudaHelper::GetPointer(pointSetImpl->d_CellParticleCounts),
 
-			CudaHelper::GetPointer(d_NeighborWriteOffsets),
-			CudaHelper::GetPointer(d_Neighbors),
+			d_NeighborWriteOffsets, d_Neighbors,
 			CudaHelper::GetPointer(pointSetImpl->d_ReversedSortIndices)
 			);
 
@@ -249,8 +257,6 @@ namespace cuNSearch
 		//Copy data to host
 		USE_TIMING(Timing::startTiming("Neighbor copy from device to host - resize"));
 
-		auto &neighborSet = queryPointSet.neighbors[neighborListEntry];
-
 		if (neighborSet.NeighborCountAllocationSize < totalNeighborCount)
 		{
 			if (neighborSet.NeighborCountAllocationSize != 0)
@@ -285,9 +291,9 @@ namespace cuNSearch
 			printf("Expected amount: %f MB \n", bytesToCopy / (1024.0f * 1024.0f));
 		}
 
-		CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_Neighbors), neighborSet.Neighbors, totalNeighborCount);
-		CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborCounts), neighborSet.Counts, particleCount);
-		CudaHelper::MemcpyDeviceToHost(CudaHelper::GetPointer(d_NeighborWriteOffsets), neighborSet.Offsets, particleCount);
+		CudaHelper::MemcpyDeviceToHost( d_Neighbors, neighborSet.Neighbors, totalNeighborCount);
+		CudaHelper::MemcpyDeviceToHost( d_NeighborCounts, neighborSet.Counts, particleCount);
+		CudaHelper::MemcpyDeviceToHost( d_NeighborWriteOffsets, neighborSet.Offsets, particleCount);
 
 		USE_TIMING(Timing::stopTiming(PRINT_STATS));
 	}
diff --git a/src/cuNSearchDeviceData.h b/src/cuNSearchDeviceData.h
index e804971..868e58b 100644
--- a/src/cuNSearchDeviceData.h
+++ b/src/cuNSearchDeviceData.h
@@ -38,9 +38,9 @@ namespace cuNSearch
 		thrust::device_vector<Int3> d_MinMax;
 		thrust::device_vector<uint> d_TempSortIndices;
 
-		//Device neighbor buffers (only temporary used: after the computation the data is copied to the host)
-		thrust::device_vector<uint> d_Neighbors;
-		thrust::device_vector<uint> d_NeighborCounts;
-		thrust::device_vector<uint> d_NeighborWriteOffsets;
+		// only temporary used. After neighborhood computation pointer is handed over to point set
+/* 		uint* &d_Neighbors;
+		uint* &d_NeighborCounts;
+		uint* &d_NeighborWriteOffsets; */
 	};
 };
\ No newline at end of file