-
Notifications
You must be signed in to change notification settings - Fork 109
Priority Queue #105
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Priority Queue #105
Changes from 26 commits
5ab856e
1f2092c
6a9dc99
6b263e3
0eaaedf
249165c
c28a5ad
e8a9c1e
012ebde
8cf681a
8485bec
da608cc
8a11b7f
d1392b9
9ee6c8b
e223598
dd8c6b7
d031519
ba3a6fd
16db085
052cec0
a11bea5
e3c4a27
f6fa484
599067f
44db340
acfdf7e
d870e29
71775b6
9838569
aab4ba0
0196bde
4af61ca
a1d074a
bf930dd
2d9bda9
54dc9f3
a5c169d
4269e9c
30cbf83
bec63f3
aa12404
55cf2e6
f4814db
89eea18
7d47200
007316a
192e263
66dd359
9da822f
0cfdd94
828b00b
1932418
7c4b1f6
838e4ea
d58dd9f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| #include <vector> | ||
| #include <cstdint> | ||
| #include <random> | ||
|
|
||
| #include <benchmark/benchmark.h> | ||
|
|
||
| #include <cuco/priority_queue.cuh> | ||
| #include <cuco/detail/pair.cuh> | ||
|
|
||
| #include <thrust/device_vector.h> | ||
|
|
||
| using namespace cuco; | ||
|
|
||
| template <typename T> | ||
| struct pair_less { | ||
| __host__ __device__ bool operator()(const T& a, const T& b) const { | ||
| return a.first < b.first; | ||
| } | ||
| }; | ||
|
|
||
// Fills [output_begin, output_end) with pseudo-random key/value pairs.
//
// A std::mt19937 engine seeded from std::random_device supplies the numbers, so
// the generated contents differ from run to run. For every slot the key is
// drawn first and the value second; each is an engine output converted with
// static_cast to Key / Value respectively.
//
// Template parameters:
//   Key      - type the first member of each generated pair is cast to
//   Value    - type the second member of each generated pair is cast to
//   OutputIt - iterator whose dereferenced element is assignable from
//              `{Key, Value}`
//
// Parameters:
//   output_begin - first slot to overwrite
//   output_end   - one past the last slot to overwrite
template <typename Key, typename Value, typename OutputIt>
static void generate_keys_uniform(OutputIt output_begin, OutputIt output_end)
{
  std::random_device rd;
  std::mt19937 gen{rd()};

  // Walk the range with the iterator itself rather than an `int` counter: an
  // `auto i = 0` index is an int and would overflow on ranges longer than
  // INT_MAX, while the range length is a (wider) difference_type.
  for (auto out = output_begin; out != output_end; ++out) {
    // Braced initialisation evaluates its elements left to right, so the key
    // is always generated before the value.
    *out = {static_cast<Key>(gen()), static_cast<Value>(gen())};
  }
}
|
Comment on lines
+19
to
+46
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can this be replaced by
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This won't work directly, as this function is generating key-value pairs rather than just keys. We could use |
||
|
|
||
| template <typename Key, typename Value, int NumKeys, | ||
| bool FavorInsertionPerformance> | ||
| static void BM_insert(::benchmark::State& state) | ||
| { | ||
| for (auto _ : state) { | ||
| state.PauseTiming(); | ||
|
|
||
| priority_queue<pair<Key, Value>, pair_less<pair<Key, Value>>, | ||
| FavorInsertionPerformance> pq(NumKeys); | ||
|
|
||
| std::vector<pair<Key, Value>> h_pairs(NumKeys); | ||
| generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end()); | ||
| thrust::device_vector<pair<Key, Value>> d_pairs(h_pairs); | ||
|
|
||
| state.ResumeTiming(); | ||
| pq.push(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| template <typename Key, typename Value, int NumKeys, | ||
| bool FavorInsertionPerformance> | ||
| static void BM_delete(::benchmark::State& state) | ||
| { | ||
| for (auto _ : state) { | ||
| state.PauseTiming(); | ||
|
|
||
| priority_queue<pair<Key, Value>, pair_less<pair<Key, Value>>, | ||
| FavorInsertionPerformance> pq(NumKeys); | ||
|
|
||
| std::vector<pair<Key, Value>> h_pairs(NumKeys); | ||
| generate_keys_uniform<Key, Value>(h_pairs.begin(), h_pairs.end()); | ||
| thrust::device_vector<pair<Key, Value>> d_pairs(h_pairs); | ||
|
|
||
| pq.push(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
|
|
||
| state.ResumeTiming(); | ||
| pq.pop(d_pairs.begin(), d_pairs.end()); | ||
| cudaDeviceSynchronize(); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, false) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 128'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 128'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_insert, int, int, 256'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
|
|
||
| BENCHMARK_TEMPLATE(BM_delete, int, int, 256'000'000, true) | ||
| ->Unit(benchmark::kMillisecond); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| #pragma once | ||
| #include <cmath> | ||
|
|
||
| #include <cuco/detail/priority_queue_kernels.cuh> | ||
| #include <cuco/detail/error.hpp> | ||
|
|
||
| namespace cuco { | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::priority_queue | ||
| (size_t initial_capacity, | ||
| Allocator const& allocator) : | ||
| allocator_{allocator}, | ||
| int_allocator_{allocator}, | ||
| t_allocator_{allocator}, | ||
| size_t_allocator_{allocator} { | ||
|
|
||
| node_size_ = NodeSize; | ||
|
|
||
| // Round up to the nearest multiple of node size | ||
| int nodes = ((initial_capacity + node_size_ - 1) / node_size_); | ||
|
|
||
| node_capacity_ = nodes; | ||
| lowest_level_start_ = 1 << (int)log2(nodes); | ||
|
|
||
| // Allocate device variables | ||
|
|
||
| d_size_ = std::allocator_traits<int_allocator_type>::allocate(int_allocator_, | ||
| 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_size_, 0, sizeof(int))); | ||
|
|
||
| d_p_buffer_size_ = std::allocator_traits<size_t_allocator_type> | ||
| ::allocate(size_t_allocator_, 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_p_buffer_size_, 0, sizeof(size_t))); | ||
|
|
||
| d_heap_ = std::allocator_traits<t_allocator_type> | ||
| ::allocate(t_allocator_, | ||
| node_capacity_ * node_size_ + node_size_); | ||
|
|
||
| d_locks_ = std::allocator_traits<int_allocator_type> | ||
| ::allocate(int_allocator_, node_capacity_ + 1); | ||
|
|
||
| CUCO_CUDA_TRY(cudaMemset(d_locks_, 0, | ||
| sizeof(int) * (node_capacity_ + 1))); | ||
|
|
||
|
|
||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::~priority_queue() { | ||
| std::allocator_traits<int_allocator_type>::deallocate(int_allocator_, | ||
| d_size_, 1); | ||
| std::allocator_traits<size_t_allocator_type>::deallocate(size_t_allocator_, | ||
| d_p_buffer_size_, 1); | ||
| std::allocator_traits<t_allocator_type>::deallocate(t_allocator_, | ||
| d_heap_, | ||
| node_capacity_ * node_size_ + node_size_); | ||
| std::allocator_traits<int_allocator_type>::deallocate(int_allocator_, | ||
| d_locks_, | ||
| node_capacity_ + 1); | ||
| } | ||
|
|
||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename InputIt> | ||
| void priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::push(InputIt first, | ||
| InputIt last, | ||
| cudaStream_t stream) { | ||
|
|
||
| const int kBlockSize = min(256, (int)node_size_); | ||
|
PointKernel marked this conversation as resolved.
Outdated
|
||
| const int kNumBlocks = min(64000, | ||
| max(1, (int)((last - first) / node_size_))); | ||
|
|
||
| PushKernel<<<kNumBlocks, kBlockSize, | ||
| get_shmem_size(kBlockSize), stream>>> | ||
| (first, last - first, d_heap_, d_size_, | ||
| node_size_, d_locks_, d_p_buffer_size_, lowest_level_start_, | ||
| compare_); | ||
|
|
||
| CUCO_CUDA_TRY(cudaGetLastError()); | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename OutputIt> | ||
| void priority_queue<T, Compare, FavorInsertionPerformance, | ||
| Allocator>::pop(OutputIt first, | ||
| OutputIt last, | ||
| cudaStream_t stream) { | ||
|
|
||
| int pop_size = last - first; | ||
| const int partial = pop_size % node_size_; | ||
|
|
||
| const int kBlockSize = min(256, (int)node_size_); | ||
| const int kNumBlocks = min(64000, | ||
| max(1, (int)((pop_size - partial) / node_size_))); | ||
|
|
||
| PopKernel<<<kNumBlocks, kBlockSize, | ||
| get_shmem_size(kBlockSize), stream>>> | ||
| (first, pop_size, d_heap_, d_size_, | ||
| node_size_, d_locks_, d_p_buffer_size_, | ||
| lowest_level_start_, node_capacity_, compare_); | ||
|
|
||
| CUCO_CUDA_TRY(cudaGetLastError()); | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename CG, typename InputIt> | ||
| __device__ void priority_queue<T, Compare, | ||
| FavorInsertionPerformance, Allocator> | ||
| ::device_mutable_view::push( | ||
| CG const& g, | ||
| InputIt first, | ||
| InputIt last, | ||
| void *temp_storage) { | ||
|
|
||
| SharedMemoryLayout<T> shmem = | ||
| GetSharedMemoryLayout<T>((int*)temp_storage, | ||
| g.size(), node_size_); | ||
|
|
||
| auto push_size = last - first; | ||
| for (size_t i = 0; i < push_size / node_size_; i++) { | ||
| PushSingleNode(g, first + i * node_size_, d_heap_, d_size_, node_size_, | ||
| d_locks_, lowest_level_start_, shmem, compare_); | ||
| } | ||
|
|
||
| if (push_size % node_size_ != 0) { | ||
| PushPartialNode(g, first + (push_size / node_size_) * node_size_, | ||
| push_size % node_size_, d_heap_, | ||
| d_size_, node_size_, d_locks_, | ||
| d_p_buffer_size_, lowest_level_start_, shmem, | ||
| compare_); | ||
| } | ||
| } | ||
|
|
||
| template <typename T, typename Compare, bool FavorInsertionPerformance, | ||
| typename Allocator> | ||
| template <typename CG, typename OutputIt> | ||
| __device__ void priority_queue<T, Compare, | ||
| FavorInsertionPerformance, Allocator> | ||
| ::device_mutable_view::pop( | ||
| CG const& g, | ||
| OutputIt first, | ||
| OutputIt last, | ||
| void *temp_storage) { | ||
| SharedMemoryLayout<T> shmem = | ||
| GetSharedMemoryLayout<T>((int*)temp_storage, | ||
| g.size(), node_size_); | ||
|
|
||
| auto pop_size = last - first; | ||
| for (size_t i = 0; i < pop_size / node_size_; i++) { | ||
| PopSingleNode(g, first + i * node_size_, | ||
| d_heap_, d_size_, node_size_, d_locks_, | ||
| d_p_buffer_size_, lowest_level_start_, | ||
| node_capacity_, shmem, compare_); | ||
| } | ||
|
|
||
| if (pop_size % node_size_ != 0) { | ||
| PopPartialNode(g, first + (pop_size / node_size_) * node_size_, | ||
| last - first, d_heap_, d_size_, node_size_, | ||
| d_locks_, d_p_buffer_size_, lowest_level_start_, | ||
| node_capacity_, shmem, compare_); | ||
| } | ||
| } | ||
|
|
||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.