44 #ifndef KOKKOS_MEMORYPOOL_HPP 45 #define KOKKOS_MEMORYPOOL_HPP 47 #include <Kokkos_Core_fwd.hpp> 50 #include <impl/Kokkos_ConcurrentBitset.hpp> 51 #include <impl/Kokkos_Error.hpp> 52 #include <impl/Kokkos_SharedAlloc.hpp> 56 template<
typename DeviceType >
// Alias for the concurrent bitset used to track per-superblock block state.
60 typedef typename Kokkos::Impl::concurrent_bitset CB ;
// Bit-layout constants re-exported from the concurrent bitset: the state
// word packs log2(block count) above 'state_shift' and the used-block
// count in the low bits masked by 'state_used_mask'.
62 enum : uint32_t { bits_per_int_lg2 = CB::bits_per_int_lg2 };
63 enum : uint32_t { state_shift = CB::state_shift };
64 enum : uint32_t { state_used_mask = CB::state_used_mask };
65 enum : uint32_t { state_header_mask = CB::state_header_mask };
66 enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
67 enum : uint32_t { max_bit_count = CB::max_bit_count };
// Default pool geometry (log2 values): 1<<6 = 64 byte minimum block,
// 1<<12 = 4 KiB maximum block, 1<<16 = 64 KiB superblock.
70 enum : uint32_t { MIN_BLOCK_SIZE_LG2 = 6 };
71 enum : uint32_t { MAX_BLOCK_SIZE_LG2 = 12 };
72 enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 };
// Two hint slots are reserved per supported block size (see constructor).
74 enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
// Memory space in which the pool's backing allocation lives.
88 typedef typename DeviceType::memory_space base_memory_space ;
// NOTE(review): the head of this enum was lost in extraction; presumably
// 'accessible' records whether HostSpace can access base_memory_space
// (Kokkos::Impl::MemorySpaceAccess) -- confirm against the full header.
92 , base_memory_space >::accessible };
// Reference-counted tracking of the single backing allocation record.
94 typedef Kokkos::Impl::SharedAllocationTracker Tracker ;
95 typedef Kokkos::Impl::SharedAllocationRecord
96 < base_memory_space > Record ;
// Pointer to the superblock state array at the front of the allocation.
99 uint32_t * m_sb_state_array ;
// Number of uint32_t state words per superblock.
100 uint32_t m_sb_state_size ;
// log2 of superblock size, and clamped max/min block size, in bytes.
101 uint32_t m_sb_size_lg2 ;
102 uint32_t m_max_block_size_lg2 ;
103 uint32_t m_min_block_size_lg2 ;
// Offsets (in uint32_t words) from m_sb_state_array to the hint array
// and to the start of the allocatable data region.
105 int32_t m_hint_offset ;
106 int32_t m_data_offset ;
107 int32_t m_unused_padding ;
113 KOKKOS_INLINE_FUNCTION
114 size_t capacity() const noexcept
115 {
return size_t(m_sb_count) << m_sb_size_lg2 ; }
117 KOKKOS_INLINE_FUNCTION
118 size_t min_block_size() const noexcept
119 {
return ( 1LU << m_min_block_size_lg2 ); }
121 KOKKOS_INLINE_FUNCTION
122 size_t max_block_size() const noexcept
123 {
return ( 1LU << m_max_block_size_lg2 ); }
/// Snapshot of pool usage, filled in by get_usage_statistics().
/// (Extraction artifacts removed and the elided closing brace restored;
/// member list unchanged.)
struct usage_statistics {
  size_t capacity_bytes ;       ///< Total pool capacity in bytes
  size_t superblock_bytes ;     ///< Bytes per superblock
  size_t max_block_bytes ;      ///< Maximum allocatable block size
  size_t min_block_bytes ;      ///< Minimum allocatable block size
  size_t capacity_superblocks ; ///< Number of superblocks in the pool
  size_t consumed_superblocks ; ///< Superblocks assigned a block size
  size_t consumed_blocks ;      ///< Blocks currently allocated
  size_t consumed_bytes ;       ///< Bytes currently allocated
  size_t reserved_blocks ;      ///< Unused blocks in consumed superblocks
  size_t reserved_bytes ;       ///< Unused bytes in consumed superblocks
};
// Fill 'stats' with a snapshot of current pool usage.  Host-only helper:
// when the pool's memory space is not host-accessible, the superblock
// state array is deep-copied into a temporary host buffer first.
// NOTE(review): this listing lost several lines in extraction (closing
// braces and, presumably, a HostSpace 'host' declaration) and carries
// fused original-file line numbers; code left byte-identical.
138 void get_usage_statistics( usage_statistics & stats )
const 142 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
// Read device state directly when accessible, else via a host copy.
144 uint32_t *
const sb_state_array =
145 accessible ? m_sb_state_array : (uint32_t *) host.
allocate(alloc_size);
147 if ( ! accessible ) {
148 Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
149 ( sb_state_array , m_sb_state_array , alloc_size );
// Static pool geometry, derivable without scanning superblocks.
152 stats.superblock_bytes = ( 1LU << m_sb_size_lg2 );
153 stats.max_block_bytes = ( 1LU << m_max_block_size_lg2 );
154 stats.min_block_bytes = ( 1LU << m_min_block_size_lg2 );
155 stats.capacity_bytes = stats.superblock_bytes * m_sb_count ;
156 stats.capacity_superblocks = m_sb_count ;
157 stats.consumed_superblocks = 0 ;
158 stats.consumed_blocks = 0 ;
159 stats.consumed_bytes = 0 ;
160 stats.reserved_blocks = 0 ;
161 stats.reserved_bytes = 0 ;
// Accumulate usage by walking each superblock's leading state word.
163 const uint32_t * sb_state_ptr = sb_state_array ;
165 for ( int32_t i = 0 ; i < m_sb_count
166 ; ++i , sb_state_ptr += m_sb_state_size ) {
// High bits hold log2(block count); nonzero means this superblock has
// been claimed for a particular block size.
168 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
170 if ( block_count_lg2 ) {
171 const uint32_t block_count = 1u << block_count_lg2 ;
172 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
173 const uint32_t block_size = 1u << block_size_lg2 ;
// Low bits count blocks currently allocated from this superblock.
174 const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
176 stats.consumed_superblocks++ ;
177 stats.consumed_blocks += block_used ;
178 stats.consumed_bytes += block_used * block_size ;
// "Reserved" = blocks committed to this size but not yet allocated.
179 stats.reserved_blocks += block_count - block_used ;
180 stats.reserved_bytes += (block_count - block_used ) * block_size ;
// Release the temporary host copy, if one was made.
184 if ( ! accessible ) {
185 host.
deallocate( sb_state_array, alloc_size );
// Print a human-readable dump of the pool state to 's': overall pool and
// superblock sizes, then one line per claimed superblock showing its
// block size and used/total block counts.  Host-only.
// NOTE(review): extraction dropped closing braces and the 'host'
// declaration and fused line numbers into the text; code left byte-identical.
189 void print_state( std::ostream & s )
const 193 const size_t alloc_size = m_hint_offset *
sizeof(uint32_t);
// Same accessible-or-host-copy pattern as get_usage_statistics().
195 uint32_t *
const sb_state_array =
196 accessible ? m_sb_state_array : (uint32_t *) host.
allocate(alloc_size);
198 if ( ! accessible ) {
199 Kokkos::Impl::DeepCopy< Kokkos::HostSpace , base_memory_space >
200 ( sb_state_array , m_sb_state_array , alloc_size );
203 const uint32_t * sb_state_ptr = sb_state_array ;
// Header line: total pool bytes and bytes per superblock.
205 s <<
"pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) <<
")" 206 <<
" superblock_size(" << ( 1 << m_sb_size_lg2 ) <<
")" << std::endl ;
208 for ( int32_t i = 0 ; i < m_sb_count
209 ; ++i , sb_state_ptr += m_sb_state_size ) {
// A zero state word is an unclaimed superblock; skip it.
211 if ( *sb_state_ptr ) {
213 const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
214 const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
215 const uint32_t block_count = 1 << block_count_lg2 ;
216 const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
// One line per claimed superblock: index, block size, used/total.
218 s <<
"Superblock[ " << i <<
" / " << m_sb_count <<
" ] {" 219 <<
" block_size(" << ( 1 << block_size_lg2 ) <<
")" 220 <<
" block_count( " << block_used
221 <<
" / " << block_count <<
// Release the temporary host copy, if one was made.
" )" 226 if ( ! accessible ) {
227 host.
deallocate( sb_state_array, alloc_size );
233 MemoryPool() = default ;
234 MemoryPool( MemoryPool && ) = default ;
235 MemoryPool(
const MemoryPool & ) = default ;
236 MemoryPool & operator = ( MemoryPool && ) = default ;
237 MemoryPool & operator = (
const MemoryPool & ) = default ;
// Construct a pool in 'memspace' of at least 'min_total_alloc_size'
// bytes, serving blocks between 'min_block_alloc_size' and
// 'max_block_alloc_size' bytes, with superblocks of at least
// 'min_superblock_size' bytes.  Sizes are rounded to powers of two and
// clamped so every superblock's block count fits the concurrent bitset.
// NOTE(review): extraction dropped several lines here (the start of the
// member-init list, the body's opening brace, the statement assigning
// m_sb_size_lg2 at the elided line before 282, and various closing
// braces) and fused original line numbers into the text; the visible
// code is left byte-identical.
253 MemoryPool(
const base_memory_space & memspace
254 ,
const size_t min_total_alloc_size
255 ,
const uint32_t min_block_alloc_size
256 ,
const uint32_t max_block_alloc_size
257 ,
const uint32_t min_superblock_size
260 , m_sb_state_array(0)
263 , m_max_block_size_lg2(0)
264 , m_min_block_size_lg2(0)
268 , m_unused_padding(0)
// Align state-array sub-sections to 8 uint32_t words (32 bytes).
270 const uint32_t int_align_lg2 = 3 ;
271 const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
// Round requested block / superblock sizes up to powers of two.
275 m_min_block_size_lg2 =
276 Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
278 m_max_block_size_lg2 =
279 Kokkos::Impl::integral_power_of_two_that_contains(max_block_alloc_size);
282 Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
// Clamp: a superblock's block count must not exceed the bitset's
// maximum bit count (blocks-per-superblock <= 1 << max_bit_count_lg2).
288 if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
289 m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
291 if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
292 m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
// Enforce min <= max <= superblock size ordering.
294 if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
295 m_max_block_size_lg2 = m_min_block_size_lg2 ;
297 if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
298 m_sb_size_lg2 = m_max_block_size_lg2 ;
// Require at least 32 (1 << 5) minimum-size blocks per superblock.
303 if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
304 m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
// Number of superblocks needed to cover the requested total size.
310 const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;
312 m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
// State words per superblock, sized for the worst-case (largest) block
// count and rounded up to the alignment.
317 const uint32_t max_block_count_lg2 =
318 m_sb_size_lg2 - m_min_block_size_lg2 ;
321 ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
325 const size_t all_sb_state_size =
326 ( m_sb_count * m_sb_state_size + int_align_mask ) & ~int_align_mask ;
// One hint pair per supported block size follows the state arrays.
330 const int32_t number_block_sizes =
331 1 + m_max_block_size_lg2 - m_min_block_size_lg2 ;
336 const int32_t block_size_array_size =
337 ( number_block_sizes + int_align_mask ) & ~int_align_mask ;
339 m_hint_offset = all_sb_state_size ;
340 m_data_offset = m_hint_offset +
341 block_size_array_size * HINT_PER_BLOCK_SIZE ;
// One backing allocation: header (state + hints) then the data region.
345 const size_t header_size = m_data_offset *
sizeof(uint32_t);
346 const size_t alloc_size = header_size +
347 ( size_t(m_sb_count) << m_sb_size_lg2 );
349 Record * rec = Record::allocate( memspace ,
"MemoryPool" , alloc_size );
351 m_tracker.assign_allocated_record_to_uninitialized( rec );
353 m_sb_state_array = (uint32_t *) rec->data();
// Initialize header on the host (directly, or via staging buffer).
357 uint32_t *
const sb_state_array =
358 accessible ? m_sb_state_array
359 : (uint32_t *) host.
allocate(header_size);
361 for ( int32_t i = 0 ; i < m_data_offset ; ++i ) sb_state_array[i] = 0 ;
// Partition superblocks evenly across block sizes; seed each block
// size's hint pair and stamp the partitioned superblocks' state words.
365 for ( int32_t i = 0 ; i < number_block_sizes ; ++i ) {
366 const uint32_t block_size_lg2 = i + m_min_block_size_lg2 ;
367 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
368 const uint32_t block_state = block_count_lg2 << state_shift ;
369 const uint32_t hint_begin = m_hint_offset + i * HINT_PER_BLOCK_SIZE ;
375 const int32_t jbeg = ( i * m_sb_count ) / number_block_sizes ;
376 const int32_t jend = ( ( i + 1 ) * m_sb_count ) / number_block_sizes ;
378 sb_state_array[ hint_begin ] = uint32_t(jbeg);
379 sb_state_array[ hint_begin + 1 ] = uint32_t(jbeg);
381 for ( int32_t j = jbeg ; j < jend ; ++j ) {
382 sb_state_array[ j * m_sb_state_size ] = block_state ;
// Push the host-initialized header to the device if needed.
388 if ( ! accessible ) {
389 Kokkos::Impl::DeepCopy< base_memory_space , Kokkos::HostSpace >
390 ( m_sb_state_array , sb_state_array , header_size );
392 host.
deallocate( sb_state_array, header_size );
// Make initialization visible to all threads before first use.
395 Kokkos::memory_fence();
406 KOKKOS_FORCEINLINE_FUNCTION
407 unsigned get_block_size_lg2(
unsigned n )
const noexcept
409 const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
411 return i < m_min_block_size_lg2 ? m_min_block_size_lg2 : i ;
416 KOKKOS_INLINE_FUNCTION
417 uint32_t allocate_block_size( uint32_t alloc_size )
const noexcept
419 return alloc_size <= (1UL << m_max_block_size_lg2)
420 ? ( 1u << get_block_size_lg2( alloc_size ) )
// Attempt to allocate 'alloc_size' bytes from the pool, retrying up to
// 'attempt_limit' times; aborts if the request exceeds the maximum block
// size.  NOTE(review): extraction dropped many lines from this method
// (declarations of 'p', 'sb_id', and the 'result' pair, plus numerous
// braces and #endif) and fused original line numbers into the text; the
// visible code is left byte-identical.
435 void * allocate(
size_t alloc_size
436 , int32_t attempt_limit = 1 ) const noexcept
// Map the request onto a supported power-of-two block size.
440 const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
442 if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// State word a superblock serving this block size must carry.
447 const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
448 const uint32_t block_state = block_count_lg2 << state_shift ;
449 const uint32_t block_count = 1u << block_count_lg2 ;
450 const uint32_t block_count_mask = block_count - 1 ;
// Hint pair for this block size: [0] = active superblock id,
// [1] = id at which a search for another superblock begins.
456 volatile uint32_t *
const hint_sb_id_ptr
459 + HINT_PER_BLOCK_SIZE
460 * ( block_size_lg2 - m_min_block_size_lg2 );
462 const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] );
// Scatter concurrent allocations across bits using the clock (and, on
// CUDA, the thread id) as a starting-bit hint.
468 const uint32_t block_id_hint = block_count_mask &
469 (uint32_t)( Kokkos::Impl::clock_tic()
470 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) 473 + ( threadIdx.x + blockDim.x * threadIdx.y )
479 volatile uint32_t * sb_state_array = 0 ;
// Retry loop: each pass tries the hinted superblock, then searches.
481 while ( attempt_limit ) {
483 int32_t hint_sb_id = -1 ;
// Re-read the hint each attempt; another thread may have updated it.
487 sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
489 sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
// Only try to acquire if the superblock still serves this block size.
496 if ( block_state == ( state_header_mask & *sb_state_array ) ) {
502 CB::acquire_bounded_lg2( sb_state_array
// result.first >= 0 is a successfully acquired bit (block index).
513 if ( 0 <= result.
first ) {
// Translate (superblock id, block index) into a data-region pointer.
517 p = ((
char*)( m_sb_state_array + m_data_offset ))
518 + ( uint32_t(sb_id) << m_sb_size_lg2 )
519 + ( result.
first << block_size_lg2 );
// Acquire failed: search all superblocks, remembering the first
// completely empty one as a fallback to claim.
537 int32_t sb_id_empty = -1 ;
539 sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
541 for ( int32_t i = 0 ,
id = sb_id_begin ; i < m_sb_count ; ++i ) {
547 const uint32_t state = *sb_state_array ;
548 const uint32_t used = state & state_used_mask ;
// Prefer a superblock already serving this size with a free block.
550 if ( block_state == ( state & state_header_mask ) ) {
554 if ( used < block_count ) {
// Publish it as the new hint if it won't fill up immediately.
560 if ( used + 1 < block_count ) {
564 Kokkos::atomic_compare_exchange
565 ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
// Otherwise note the first fully-empty superblock for claiming.
571 else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
// Circular traversal over all superblocks starting at sb_id_begin.
580 if ( ++
id < m_sb_count ) {
581 sb_state_array += m_sb_state_size ;
585 sb_state_array = m_sb_state_array ;
// No partially-used match: try to claim the empty superblock for this
// block size via compare-exchange on its state word.
595 if ( 0 <= sb_id_empty ) {
603 sb_id = sb_id_empty ;
605 sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
610 const uint32_t state_empty = state_header_mask & *sb_state_array ;
613 Kokkos::atomic_compare_exchange
614 (sb_state_array,state_empty,block_state) ) {
// Claim succeeded (or another thread claimed it for the same size);
// publish the superblock as the new hint.
618 Kokkos::atomic_compare_exchange
619 ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
// Request larger than the maximum block size is a hard error.
632 Kokkos::abort(
"Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
// Return block 'p' to the pool; the size argument is ignored because the
// owning superblock's state word records the block size.  Validates that
// 'p' lies inside the pool, is block-aligned, and was not already freed;
// otherwise prints a diagnostic and aborts.
// NOTE(review): extraction dropped the declarations of 'd', 'bit', and
// 'result' and several braces, and fused original line numbers into the
// text; the visible code is left byte-identical.
646 KOKKOS_INLINE_FUNCTION
647 void deallocate(
void * p ,
size_t )
const noexcept
// Byte offset of 'p' from the start of the pool's data region.
651 ((
char*)p) - ((
char*)( m_sb_state_array + m_data_offset ));
654 const int ok_contains =
655 ( 0 <= d ) && (
size_t(d) < ( size_t(m_sb_count) << m_sb_size_lg2 ) );
657 int ok_block_aligned = 0 ;
658 int ok_dealloc_once = 0 ;
// Owning superblock and its current block-size configuration.
662 const int sb_id = d >> m_sb_size_lg2 ;
665 volatile uint32_t *
const sb_state_array =
666 m_sb_state_array + ( sb_id * m_sb_state_size );
668 const uint32_t block_state = (*sb_state_array) & state_header_mask ;
669 const uint32_t block_size_lg2 =
670 m_sb_size_lg2 - ( block_state >> state_shift );
// A valid block pointer is a multiple of the block size.
672 ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
674 if ( ok_block_aligned ) {
// Bit index of this block within its superblock's bitset.
680 ( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
// Release the bit; a negative result signals a double-free or a
// block-size/state mismatch.
683 CB::release( sb_state_array , bit , block_state );
685 ok_dealloc_once = 0 <= result ;
// Any validation failure is a fatal usage error.
696 if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
698 printf(
"Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
700 Kokkos::abort(
"Kokkos MemoryPool::deallocate given erroneous pointer");
void * allocate(const size_t arg_alloc_size) const
Allocate untracked memory in the space.
Replacement for std::pair that works on CUDA devices.
first_type first
The first element of the pair.
Memory management for host memory.
Declaration of parallel operators.
void deallocate(void *const arg_alloc_ptr, const size_t arg_alloc_size) const
Deallocate untracked memory in the space.
Access relationship between DstMemorySpace and SrcMemorySpace.