Open3D (C++ API)  0.18.0
Loading...
Searching...
No Matches
SlabNodeManager.h
Go to the documentation of this file.
1// ----------------------------------------------------------------------------
2// - Open3D: www.open3d.org -
3// ----------------------------------------------------------------------------
4// Copyright (c) 2018-2023 www.open3d.org
5// SPDX-License-Identifier: MIT
6// ----------------------------------------------------------------------------
7
8// Copyright 2019 Saman Ashkiani
9// Rewritten by Wei Dong 2019 - 2020
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13//
14// http://www.apache.org/licenses/LICENSE-2.0
15//
16// Unless required by applicable law or agreed to in writing, software
17// distributed under the License is distributed on an "AS IS" BASIS,
18// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
19// implied. See the License for the specific language governing permissions
20// and limitations under the License.
21
22#pragma once
23
24#include <thrust/device_vector.h>
25
26#include <memory>
27
33
34namespace open3d {
35namespace core {
36
39class Slab {
40public:
43 buf_index_t kv_pair_ptrs[kWarpSize - 1];
46};
47
49public:
51 : super_blocks_(nullptr),
52 hash_coef_(0),
53 num_attempts_(0),
54 memory_block_index_(0),
55 super_block_index_(0) {}
56
57 __device__ __forceinline__ uint32_t* get_unit_ptr_from_slab(
58 const buf_index_t& next_slab_ptr, const uint32_t& lane_id) {
59 return super_blocks_ + addressDecoder(next_slab_ptr) + lane_id;
60 }
61 __device__ __forceinline__ uint32_t* get_ptr_for_bitmap(
62 const uint32_t super_block_idx, const uint32_t bitmap_idx) {
63 return super_blocks_ + super_block_idx * kUIntsPerSuperBlock +
64 bitmap_idx;
65 }
66
67 // Objective: each warp selects its own memory_block warp allocator.
68 __device__ void Init(uint32_t& tid, uint32_t& lane_id) {
69 // Hashing the memory block to be used.
70 createMemBlockIndex(tid >> 5);
71
72 // Loading the assigned memory block.
73 memory_block_bitmap_ =
74 super_blocks_[super_block_index_ * kUIntsPerSuperBlock +
75 memory_block_index_ * kSlabsPerBlock + lane_id];
76 }
77
78 __device__ uint32_t WarpAllocate(const uint32_t& lane_id) {
79 // Try and allocate a new memory units within the memory_block memory
80 // block if it returns 0xFFFFFFFF, then there was not any empty memory
81 // unit a new memory_block block should be chosen, and repeat again
82 // allocated result: 5 bits: super_block_index
83 // 17 bits: memory block index
84 // 5 bits: memory unit index (hi-bits of 10bit)
85 // 5 bits: memory unit index (lo-bits of 10bit)
86 int empty_lane = -1;
87 uint32_t free_lane;
88 uint32_t read_bitmap = memory_block_bitmap_;
89 uint32_t allocated_result = kNotFoundFlag;
90 // Works as long as <31 bit are used in the allocated_result
91 // in other words, if there are 32 super blocks and at most 64k blocks
92 // per super block.
93
94 while (allocated_result == kNotFoundFlag) {
95 empty_lane = __ffs(~memory_block_bitmap_) - 1;
96 free_lane = __ballot_sync(kSyncLanesMask, empty_lane >= 0);
97 if (free_lane == 0) {
98 // all bitmaps are full: need to be rehashed again.
99 updateMemBlockIndex((threadIdx.x + blockIdx.x * blockDim.x) >>
100 5);
101 read_bitmap = memory_block_bitmap_;
102 continue;
103 }
104 uint32_t src_lane = __ffs(free_lane) - 1;
105 if (src_lane == lane_id) {
106 read_bitmap = atomicCAS(
108 super_block_index_ * kUIntsPerSuperBlock +
109 memory_block_index_ * kSlabsPerBlock + lane_id,
110 memory_block_bitmap_,
111 memory_block_bitmap_ | (1 << empty_lane));
112 if (read_bitmap == memory_block_bitmap_) {
113 // Successful attempt.
114 memory_block_bitmap_ |= (1 << empty_lane);
115 allocated_result =
116 (super_block_index_ << kSuperBlockMaskBits) |
117 (memory_block_index_ << kBlockMaskBits) |
118 (lane_id << kSlabMaskBits) | empty_lane;
119 } else {
120 // Not successful: updating the current bitmap.
121 memory_block_bitmap_ = read_bitmap;
122 }
123 }
124 // Asking for the allocated result.
125 allocated_result =
126 __shfl_sync(kSyncLanesMask, allocated_result, src_lane);
127 }
128 return allocated_result;
129 }
130
131 // This function, frees a recently allocated memory unit by a single thread.
132 // Since it is untouched, there shouldn't be any worries for the actual
133 // memory contents to be reset again.
134 __device__ void FreeUntouched(buf_index_t ptr) {
135 atomicAnd(super_blocks_ +
136 getSuperBlockIndex(ptr) * kUIntsPerSuperBlock +
137 getMemBlockIndex(ptr) * kSlabsPerBlock +
138 (getMemUnitIndex(ptr) >> 5),
139 ~(1 << (getMemUnitIndex(ptr) & 0x1F)));
140 }
141
142private:
143 __device__ __host__ __forceinline__ uint32_t
144 getSuperBlockIndex(buf_index_t address) const {
145 return address >> kSuperBlockMaskBits;
146 }
147 __device__ __host__ __forceinline__ uint32_t
148 getMemBlockIndex(buf_index_t address) const {
149 return ((address >> kBlockMaskBits) & 0x1FFFF);
150 }
151 __device__ __host__ __forceinline__ buf_index_t
152 getMemBlockAddress(buf_index_t address) const {
153 return (kBitmapsPerSuperBlock +
154 getMemBlockIndex(address) * kUIntsPerBlock);
155 }
156 __device__ __host__ __forceinline__ uint32_t
157 getMemUnitIndex(buf_index_t address) const {
158 return address & 0x3FF;
159 }
160 __device__ __host__ __forceinline__ buf_index_t
161 getMemUnitAddress(buf_index_t address) {
162 return getMemUnitIndex(address) * kWarpSize;
163 }
164
165 // Called at the beginning of the kernel.
166 __device__ void createMemBlockIndex(uint32_t global_warp_id) {
167 super_block_index_ = global_warp_id % kSuperBlocks;
168 memory_block_index_ = (hash_coef_ * global_warp_id) >>
169 (32 - kBlocksPerSuperBlockInBits);
170 }
171
172 // Called when the allocator fails to find an empty unit to allocate.
173 __device__ void updateMemBlockIndex(uint32_t global_warp_id) {
174 num_attempts_++;
175 super_block_index_++;
176 super_block_index_ =
177 (super_block_index_ == kSuperBlocks) ? 0 : super_block_index_;
178 memory_block_index_ = (hash_coef_ * (global_warp_id + num_attempts_)) >>
179 (32 - kBlocksPerSuperBlockInBits);
180 // Loading the assigned memory block.
181 memory_block_bitmap_ =
182 *((super_blocks_ + super_block_index_ * kUIntsPerSuperBlock) +
183 memory_block_index_ * kSlabsPerBlock + (threadIdx.x & 0x1f));
184 }
185
186 __host__ __device__ buf_index_t
187 addressDecoder(buf_index_t address_ptr_index) {
188 return getSuperBlockIndex(address_ptr_index) * kUIntsPerSuperBlock +
189 getMemBlockAddress(address_ptr_index) +
190 getMemUnitIndex(address_ptr_index) * kWarpSize;
191 }
192
193 __host__ __device__ void print_address(buf_index_t address_ptr_index) {
194 printf("Super block Index: %d, Memory block index: %d, Memory unit "
195 "index: "
196 "%d\n",
197 getSuperBlockIndex(address_ptr_index),
198 getMemBlockIndex(address_ptr_index),
199 getMemUnitIndex(address_ptr_index));
200 }
201
202public:
204 uint32_t* super_blocks_;
206 uint32_t hash_coef_; // A random 32-bit.
207
208private:
210 uint32_t num_attempts_;
211 uint32_t memory_block_index_;
212 uint32_t memory_block_bitmap_;
213 uint32_t super_block_index_;
214};
215
217 uint32_t* slabs_per_superblock);
218
220public:
221 SlabNodeManager(const Device& device) : device_(device) {
224
227 impl_.super_blocks_ = static_cast<uint32_t*>(MemoryManager::Malloc(
228 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t),
229 device_));
230 Reset();
231 }
232
234
235 void Reset() {
236 OPEN3D_CUDA_CHECK(cudaMemset(
237 impl_.super_blocks_, 0xFF,
238 kUIntsPerSuperBlock * kSuperBlocks * sizeof(uint32_t)));
239
240 for (uint32_t i = 0; i < kSuperBlocks; i++) {
241 // setting bitmaps into zeros:
242 OPEN3D_CUDA_CHECK(cudaMemset(
243 impl_.super_blocks_ + i * kUIntsPerSuperBlock, 0x00,
244 kBlocksPerSuperBlock * kSlabsPerBlock * sizeof(uint32_t)));
245 }
247 OPEN3D_CUDA_CHECK(cudaGetLastError());
248 }
249
250 std::vector<int> CountSlabsPerSuperblock() {
251 const uint32_t num_super_blocks = kSuperBlocks;
252
253 thrust::device_vector<uint32_t> slabs_per_superblock(kSuperBlocks);
254 thrust::fill(slabs_per_superblock.begin(), slabs_per_superblock.end(),
255 0);
256
257 // Counting total number of allocated memory units.
258 int num_mem_units = kBlocksPerSuperBlock * 32;
259 int num_cuda_blocks =
260 (num_mem_units + kThreadsPerBlock - 1) / kThreadsPerBlock;
261 CountSlabsPerSuperblockKernel<<<num_cuda_blocks, kThreadsPerBlock, 0,
262 core::cuda::GetStream()>>>(
263 impl_, thrust::raw_pointer_cast(slabs_per_superblock.data()));
265 OPEN3D_CUDA_CHECK(cudaGetLastError());
266
267 std::vector<int> result(num_super_blocks);
268 thrust::copy(slabs_per_superblock.begin(), slabs_per_superblock.end(),
269 result.begin());
270
271 return result;
272 }
273
274public:
277};
278} // namespace core
279} // namespace open3d
Common CUDA utilities.
#define OPEN3D_CUDA_CHECK(err)
Definition CUDAUtils.h:47
core::Tensor result
Definition VtkUtils.cpp:75
Definition Device.h:18
static void * Malloc(size_t byte_size, const Device &device)
Definition MemoryManager.cpp:22
static void Free(void *ptr, const Device &device)
Frees previously allocated memory at address ptr on device device.
Definition MemoryManager.cpp:28
Definition SlabNodeManager.h:39
buf_index_t kv_pair_ptrs[kWarpSize - 1]
Definition SlabNodeManager.h:43
buf_index_t next_slab_ptr
An internal ptr managed by InternalNodeManager.
Definition SlabNodeManager.h:45
Definition SlabNodeManager.h:219
std::vector< int > CountSlabsPerSuperblock()
Definition SlabNodeManager.h:250
SlabNodeManager(const Device &device)
Definition SlabNodeManager.h:221
void Reset()
Definition SlabNodeManager.h:235
~SlabNodeManager()
Definition SlabNodeManager.h:233
SlabNodeManagerImpl impl_
Definition SlabNodeManager.h:275
Device device_
Definition SlabNodeManager.h:276
Definition SlabNodeManager.h:48
uint32_t * super_blocks_
A pointer to each super-block.
Definition SlabNodeManager.h:204
__device__ void Init(uint32_t &tid, uint32_t &lane_id)
Definition SlabNodeManager.h:68
__device__ __forceinline__ uint32_t * get_unit_ptr_from_slab(const buf_index_t &next_slab_ptr, const uint32_t &lane_id)
Definition SlabNodeManager.h:57
uint32_t hash_coef_
hash_coef (register): used as (16 bits, 16 bits) for hashing.
Definition SlabNodeManager.h:206
__device__ __forceinline__ uint32_t * get_ptr_for_bitmap(const uint32_t super_block_idx, const uint32_t bitmap_idx)
Definition SlabNodeManager.h:61
__device__ void FreeUntouched(buf_index_t ptr)
Definition SlabNodeManager.h:134
__device__ uint32_t WarpAllocate(const uint32_t &lane_id)
Definition SlabNodeManager.h:78
SlabNodeManagerImpl()
Definition SlabNodeManager.h:50
void Synchronize()
Definition CUDAUtils.cpp:58
uint32_t buf_index_t
Definition HashBackendBuffer.h:44
__global__ void CountSlabsPerSuperblockKernel(SlabNodeManagerImpl impl, uint32_t *slabs_per_superblock)
const char const char value recording_handle imu_sample recording_handle uint8_t size_t data_size k4a_record_configuration_t config target_format k4a_capture_t capture_handle k4a_imu_sample_t imu_sample playback_handle k4a_logging_message_cb_t void min_level device_handle k4a_imu_sample_t timeout_in_ms capture_handle capture_handle capture_handle image_handle temperature_c k4a_image_t image_handle uint8_t image_handle image_handle image_handle image_handle uint32_t
Definition K4aPlugin.cpp:548
uint32_t RandUint32()
Definition Random.cpp:59
Definition PinholeCameraIntrinsic.cpp:16