
I did some performance test, with 4 threads -O3 and ITERATIONS = 10.000.000, and there is no performance difference between the two versions (see below). My opinion is that we should stick with a standard random engine concept I used, merge our code (we both have non-overlapping bits that are needed), and think a bit more about random function concepts at a later stage. For me a random engine is al I need. timings: 7534.86 7603.93 ms timings: 7640.01 7533.52 ms timings: 7648.3 7604.28 ms timings: 7595.59 7626.43 ms timings: 6767.75 6432.64 ms timings: 7635.71 7535.7 ms timings: 7604.28 7618.24 ms timings: 7734.14 7091.38 ms timings: 7366.14 7581.43 ms timings: 7559.15 7168.32 ms timings: 7538.35 7355.96 ms timings: 7476.68 7702.23 ms timings: 6757.96 7545.04 ms timings: 7607.6 7608.7 ms timings: 7608.16 7657.03 ms timings: 6886.66 7698.5 ms timings: 6833.46 6466.57 ms timings: 7624.62 6443.1 ms timings: 6741.91 6464.72 ms timings: 7464.41 6428.27 ms left: void thermalize(atom* atoms, size_t Natoms, uint32_t timestep, const Prf::key_type& key){ counter_based_engine<Prf, 32> cbeng(key); for(size_t i=0; i<Natoms; ++i){ float rmsvelocity = sqrt(kT/atoms[i].mass); normal_distribution<float> mbd(0., rmsvelocity); Prf::domain_type start = {atoms[i].id, timestep, THERMALIZE_CTXT}; cbeng.restart(start); for (size_t j=0; j<ITERATIONS; ++j) { atoms[i].vx += mbd(cbeng); atoms[i].vy += mbd(cbeng); atoms[i].vz += mbd(cbeng); } } } right: void thermalize_std(atom* atoms, size_t Natoms, uint32_t timestep){ for(size_t i=0; i<Natoms; ++i){ float rmsvelocity = sqrt(kT/atoms[i].mass); normal_distribution<float> mbd(0., rmsvelocity); threefry4x64_engine<uint32_t, 32, 20, 1, 1> eng(i); for (size_t j=0; j<ITERATIONS; ++j) { atoms[i].vx += mbd(eng); atoms[i].vy += mbd(eng); atoms[i].vz += mbd(eng); } } }