BitMagic-C++
bmrandom.h
Go to the documentation of this file.
1#ifndef BMRANDOM__H__INCLUDED__
2#define BMRANDOM__H__INCLUDED__
3/*
4Copyright(c) 2002-2019 Anatoliy Kuznetsov(anatoliy_kuznetsov at yahoo.com)
5
6Licensed under the Apache License, Version 2.0 (the "License");
7you may not use this file except in compliance with the License.
8You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12Unless required by applicable law or agreed to in writing, software
13distributed under the License is distributed on an "AS IS" BASIS,
14WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15See the License for the specific language governing permissions and
16limitations under the License.
17
18For more information please visit: http://bitmagic.io
19*/
20
21/*! \file bmrandom.h
22 \brief Generation of random subset
23*/
24
25#ifndef BM__H__INCLUDED__
26// BitMagic utility headers do not include main "bm.h" declaration
27// #include "bm.h" or "bm64.h" explicitly
28# error missing include (bm.h or bm64.h)
29#endif
30
31
32#include "bmfunc.h"
33#include "bmdef.h"
34
35#include <stdlib.h>
36#include <random>
37#include <algorithm>
38
39namespace bm
40{
41
42
43/*!
44 Class implements algorithm for random subset generation.
45
46 Implemented method tries to be fair, but doesn't guarantee
47 true randomeness or absense of bias.
48
49 Performace note:
50 Class holds temporary buffers and variables, so it is recommended to
51 re-use instances over multiple calls.
52
53 \ingroup setalgo
54*/
55template<class BV>
57{
58public:
59 typedef BV bvector_type;
60 typedef typename BV::size_type size_type;
61public:
64
65 /// Get random subset of input vector
66 ///
67 /// @param bv_out - destination vector
68 /// @param bv_in - input vector
69 /// @param sample_count - number of bits to pick
70 ///
71 void sample(BV& bv_out, const BV& bv_in, size_type sample_count);
72
73
74private:
75 typedef
76 typename BV::blocks_manager_type blocks_manager_type;
77
78private:
79 /// simple picking algorithm for small number of items
80 /// in [first,last] range
81 ///
82 void simple_pick(BV& bv_out,
83 const BV& bv_in,
84 size_type sample_count,
85 size_type first,
86 size_type last);
87
88 void get_subset(BV& bv_out,
89 const BV& bv_in,
90 size_type in_count,
91 size_type sample_count);
92
93 void get_block_subset(bm::word_t* blk_out,
94 const bm::word_t* blk_src,
95 unsigned count);
96 static
97 unsigned process_word(bm::word_t* blk_out,
98 const bm::word_t* blk_src,
99 unsigned nword,
100 unsigned take_count) BMNOEXCEPT;
101
102 static
103 void get_random_array(bm::word_t* blk_out,
105 unsigned bit_list_size,
106 unsigned count);
107 static
108 unsigned compute_take_count(unsigned bc,
109 size_type in_count, size_type sample_count) BMNOEXCEPT;
110
111
112private:
114 random_subset& operator=(const random_subset&);
115private:
116 typename bvector_type::rs_index_type rsi_; ///< RS index (block counts)
117 bvector_type bv_nb_; ///< blocks vector
119 bm::word_t* sub_block_;
120};
121
122
123///////////////////////////////////////////////////////////////////
124
125
126
127template<class BV>
132
133template<class BV>
135{
136 delete [] sub_block_;
137}
138
139template<class BV>
141 const BV& bv_in,
142 size_type sample_count)
143{
144 if (sample_count == 0)
145 {
146 bv_out.clear(true);
147 return;
148 }
149
150 rsi_.init();
151 bv_in.build_rs_index(&rsi_, &bv_nb_);
152
153 size_type in_count = rsi_.count();
154 if (in_count <= sample_count)
155 {
156 bv_out = bv_in;
157 return;
158 }
159
160 float pick_ratio = float(sample_count) / float(in_count);
161 if (pick_ratio < 0.054f)
162 {
163 size_type first, last;
164 bool b = bv_in.find_range(first, last);
165 if (!b)
166 return;
167
168 simple_pick(bv_out, bv_in, sample_count, first, last);
169 return;
170 }
171
172 if (sample_count > in_count/2)
173 {
174 // build the complement vector and subtract it
175 BV tmp_bv;
176 size_type delta_count = in_count - sample_count;
177
178 get_subset(tmp_bv, bv_in, in_count, delta_count);
179 bv_out = bv_in;
180 bv_out -= tmp_bv;
181 return;
182 }
183 get_subset(bv_out, bv_in, in_count, sample_count);
184}
185
186template<class BV>
187void random_subset<BV>::simple_pick(BV& bv_out,
188 const BV& bv_in,
189 size_type sample_count,
190 size_type first,
191 size_type last)
192{
193 bv_out.clear(true);
194
195 std::random_device rd;
196 #ifdef BM64ADDR
197 std::mt19937_64 mt_rand(rd());
198 #else
199 std::mt19937 mt_rand(rd());
200 #endif
201 std::uniform_int_distribution<size_type> dist(first, last);
202
203 while (sample_count)
204 {
205 size_type fidx;
206 size_type ridx = dist(mt_rand); // generate random position
207
208 BM_ASSERT(ridx >= first && ridx <= last);
209
210 bool b = bv_in.find(ridx, fidx); // find next valid bit after random
211 BM_ASSERT(b);
212 if (b)
213 {
214 // set true if was false
215 bool is_set = bv_out.set_bit_conditional(fidx, true, false);
216 sample_count -= is_set;
217 while (!is_set) // find next valid (and not set) bit
218 {
219 ++fidx;
220 // searching always left to right may create a bias...
221 b = bv_in.find(fidx, fidx);
222 if (!b)
223 break;
224 if (fidx > last)
225 break;
226 is_set = bv_out.set_bit_conditional(fidx, true, false);
227 sample_count -= is_set;
228 } // while
229 }
230 } // while
231}
232
233
234template<class BV>
235void random_subset<BV>::get_subset(BV& bv_out,
236 const BV& bv_in,
237 size_type in_count,
238 size_type sample_count)
239{
240 bv_out.clear(true);
241 bv_out.resize(bv_in.size());
242
243 const blocks_manager_type& bman_in = bv_in.get_blocks_manager();
244 blocks_manager_type& bman_out = bv_out.get_blocks_manager();
245
246 bm::word_t* tmp_block = bman_out.check_allocate_tempblock();
247
248 size_type first_nb, last_nb;
249 bool b = bv_nb_.find_range(first_nb, last_nb);
250 BM_ASSERT(b);
251 if (!b)
252 return;
253
254 std::random_device rd;
255 #ifdef BM64ADDR
256 std::mt19937_64 mt_rand(rd());
257 #else
258 std::mt19937 mt_rand(rd());
259 #endif
260 std::uniform_int_distribution<size_type> dist_nb(first_nb, last_nb);
261
262 size_type curr_sample_count = sample_count;
263 for (unsigned take_count = 0; curr_sample_count; curr_sample_count -= take_count)
264 {
265 // pick block at random
266 //
267 size_type nb;
268 size_type ridx = dist_nb(mt_rand); // generate random block idx
269 BM_ASSERT(ridx >= first_nb && ridx <= last_nb);
270
271 b = bv_nb_.find(ridx, nb); // find next valid nb
272 if (!b)
273 {
274 b = bv_nb_.find(first_nb, nb);
275 if (!b)
276 {
277 b = bv_nb_.find(first_nb, nb);
278 BM_ASSERT(!bv_nb_.any());
279 BM_ASSERT(b);
280 return; // cannot find block
281 }
282 }
283 bv_nb_.clear_bit_no_check(nb); // remove from blocks list
284
285 // calculate proportinal sample count
286 //
287 unsigned bc = rsi_.count(nb);
288 BM_ASSERT(bc && (bc <= 65536));
289 take_count = compute_take_count(bc, in_count, sample_count);
290 if (take_count > curr_sample_count)
291 take_count = unsigned(curr_sample_count);
292 BM_ASSERT(take_count);
293 if (!take_count)
294 continue;
295 {
296 unsigned i0, j0;
297 bm::get_block_coord(nb, i0, j0);
298 const bm::word_t* blk_src = bman_in.get_block(i0, j0);
299 BM_ASSERT(blk_src);
300
301 // allocate target block
302 bm::word_t* blk_out = bman_out.get_block_ptr(i0, j0);
303 BM_ASSERT(!blk_out);
304 if (blk_out)
305 {
306 blk_out = bman_out.deoptimize_block(nb);
307 }
308 else
309 {
310 blk_out = bman_out.get_allocator().alloc_bit_block();
311 bman_out.set_block(nb, blk_out);
312 }
313 if (take_count == bc) // whole block take (strange)
314 {
315 // copy the whole src block
316 if (BM_IS_GAP(blk_src))
317 bm::gap_convert_to_bitset(blk_out, BMGAP_PTR(blk_src));
318 else
319 bm::bit_block_copy(blk_out, blk_src);
320 continue;
321 }
322 bm::bit_block_set(blk_out, 0);
323 if (bc < 4096) // use array shuffle
324 {
325 unsigned arr_len;
326 // convert source block to bit-block
327 if (BM_IS_GAP(blk_src))
328 {
329 arr_len = bm::gap_convert_to_arr(bit_list_,
330 BMGAP_PTR(blk_src),
332 }
333 else // bit-block
334 {
335 arr_len = bm::bit_convert_to_arr(bit_list_,
336 blk_src,
339 0);
340 }
341 BM_ASSERT(arr_len);
342 get_random_array(blk_out, bit_list_, arr_len, take_count);
343 }
344 else // dense block
345 {
346 // convert source block to bit-block
347 if (BM_IS_GAP(blk_src))
348 {
349 bm::gap_convert_to_bitset(tmp_block, BMGAP_PTR(blk_src));
350 blk_src = tmp_block;
351 }
352 // pick random bits source block to target
353 get_block_subset(blk_out, blk_src, take_count);
354 }
355 }
356 } // for
357}
358
359template<class BV>
360unsigned random_subset<BV>::compute_take_count(
361 unsigned bc,
362 size_type in_count,
363 size_type sample_count) BMNOEXCEPT
364{
365 float block_percent = float(bc) / float(in_count);
366 float bits_to_take = float(sample_count) * block_percent;
367 bits_to_take += 0.99f;
368 unsigned to_take = unsigned(bits_to_take);
369 if (to_take > bc)
370 to_take = bc;
371 return to_take;
372}
373
374
375template<class BV>
376void random_subset<BV>::get_block_subset(bm::word_t* blk_out,
377 const bm::word_t* blk_src,
378 unsigned take_count)
379{
380 for (unsigned rounds = 0; take_count && rounds < 10; ++rounds)
381 {
382 // pick random scan start and scan direction
383 //
384 unsigned i = unsigned(rand()) % bm::set_block_size;
385 unsigned new_count;
386
387 for (; i < bm::set_block_size && take_count; ++i)
388 {
389 if (blk_src[i] && (blk_out[i] != blk_src[i]))
390 {
391 new_count = process_word(blk_out, blk_src, i, take_count);
392 take_count -= new_count;
393 }
394 } // for i
395
396 } // for
397 // if masked scan did not produce enough results..
398 //
399 if (take_count)
400 {
401 // Find all vacant bits: do logical (src SUB out)
402 for (unsigned i = 0; i < bm::set_block_size; ++i)
403 {
404 sub_block_[i] = blk_src[i] & ~blk_out[i];
405 }
406 // now transform vacant bits to array, then pick random elements
407 //
408 unsigned arr_len = bm::bit_convert_to_arr(bit_list_,
409 sub_block_,
412 0);
413 BM_ASSERT(arr_len);
414 get_random_array(blk_out, bit_list_, arr_len, take_count);
415 }
416}
417
418template<class BV>
419unsigned random_subset<BV>::process_word(bm::word_t* blk_out,
420 const bm::word_t* blk_src,
421 unsigned nword,
422 unsigned take_count) BMNOEXCEPT
423{
424 unsigned new_bits, mask;
425 do
426 {
427 mask = unsigned(rand());
428 mask ^= mask << 16;
429 } while (mask == 0);
430
431 unsigned src_rand = blk_src[nword] & mask;
432 new_bits = src_rand & ~blk_out[nword];
433 if (new_bits)
434 {
435 unsigned new_count = bm::word_bitcount(new_bits);
436
437 // check if we accidentally picked more bits than needed
438 if (new_count > take_count)
439 {
440 BM_ASSERT(take_count);
441
442 unsigned char blist[64];
443 unsigned arr_size = bm::bitscan_popcnt(new_bits, blist);
444 BM_ASSERT(arr_size == new_count);
445 std::random_shuffle(blist, blist + arr_size);
446 unsigned value = 0;
447 for (unsigned j = 0; j < take_count; ++j)
448 {
449 value |= (1u << blist[j]);
450 }
451 new_bits = value;
452 new_count = take_count;
453
454 BM_ASSERT(bm::word_bitcount(new_bits) == take_count);
455 BM_ASSERT((new_bits & ~blk_src[nword]) == 0);
456 }
457
458 blk_out[nword] |= new_bits;
459 return new_count;
460 }
461 return 0;
462}
463
464
465template<class BV>
466void random_subset<BV>::get_random_array(bm::word_t* blk_out,
468 unsigned bit_list_size,
469 unsigned count)
470{
471 std::random_shuffle(bit_list, bit_list + bit_list_size);
472 for (unsigned i = 0; i < count; ++i)
473 {
474 bm::set_bit(blk_out, bit_list[i]);
475 }
476}
477
478} // namespace
479
480#include "bmundef.h"
481
482#endif
Definitions(internal)
#define BMNOEXCEPT
Definition bmdef.h:79
#define BMGAP_PTR(ptr)
Definition bmdef.h:179
#define BM_IS_GAP(ptr)
Definition bmdef.h:181
#define BM_ASSERT
Definition bmdef.h:130
Bit manipulation primitives (internal)
pre-processor un-defines to avoid global space pollution (internal)
rs_index< allocator_type > rs_index_type
Definition bm.h:831
void sample(BV &bv_out, const BV &bv_in, size_type sample_count)
Get random subset of input vector.
Definition bmrandom.h:140
BV::size_type size_type
Definition bmrandom.h:60
BMFORCEINLINE bm::id_t word_bitcount(bm::id_t w) BMNOEXCEPT
Definition bmfunc.h:197
void bit_block_copy(bm::word_t *BMRESTRICT dst, const bm::word_t *BMRESTRICT src) BMNOEXCEPT
Bitblock copy operation.
Definition bmfunc.h:5898
T bit_convert_to_arr(T *BMRESTRICT dest, const unsigned *BMRESTRICT src, bm::id_t bits, unsigned dest_len, unsigned mask=0) BMNOEXCEPT
Convert bit block into an array of ints corresponding to 1 bits.
Definition bmfunc.h:7822
BMFORCEINLINE void set_bit(unsigned *dest, unsigned bitpos) BMNOEXCEPT
Set 1 bit in a block.
Definition bmfunc.h:3125
void bit_block_set(bm::word_t *BMRESTRICT dst, bm::word_t value) BMNOEXCEPT
Bitblock memset operation.
Definition bmfunc.h:3841
unsigned short bitscan_popcnt(bm::id_t w, B *bits, unsigned short offs) BMNOEXCEPT
Unpacks word into list of ON bit indexes using popcnt method.
Definition bmfunc.h:475
unsigned bit_list(T w, B *bits) BMNOEXCEPT
Unpacks word into list of ON bit indexes.
Definition bmfunc.h:438
D gap_convert_to_arr(D *BMRESTRICT dest, const T *BMRESTRICT buf, unsigned dest_len, bool invert=false) BMNOEXCEPT
Convert gap block into array of ints corresponding to 1 bits.
Definition bmfunc.h:4320
void gap_convert_to_bitset(unsigned *BMRESTRICT dest, const T *BMRESTRICT buf) BMNOEXCEPT
GAP block to bitblock conversion.
Definition bmfunc.h:3859
Definition bm.h:77
unsigned int word_t
Definition bmconst.h:38
const unsigned set_block_size
Definition bmconst.h:54
unsigned short gap_word_t
Definition bmconst.h:77
const unsigned gap_max_bits
Definition bmconst.h:80
void get_block_coord(BI_TYPE nb, unsigned &i, unsigned &j) BMNOEXCEPT
Recalc linear bvector block index into 2D matrix coordinates.
Definition bmfunc.h:147