Reputation: 587
I'm running the following snippet of code (the std::time objects are just there for benchmarking) that gets u8
elements from a vector of vector of u8
in a given order and creates a new vector with these objects in this order.
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
let elapsed_second = now.elapsed();
new_add.push(val);
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
In my full code this inner loop ends up running hundreds of millions of times so I'm trying to optimise it as much as possible. According to be benchmarking, I seem to be spending the vast majority of the loop time in looking up values from my Vec<Vec<u8>>
, on the line val = *data.get_unchecked(*idx as usize).get_unchecked(j);
, see below which benchmarks some elapsed_first,elapsed_second,elapsed_third
times from different iterations of this loop (the i^th element of each list is from the same run):
First: [27ns, 23ns, 21ns, 24ns, 27ns, 23ns, 28ns, 23ns, 26ns, 23ns, 21ns, 22ns, 27ns, 27ns, 28ns, 23ns, 25ns, 24ns, 26ns, 25ns, 22ns, 24ns, 24ns, 28ns, 28ns, 28ns, 26ns, 22ns, 22ns, 21ns]
Second: [538ns, 695ns, 550ns, 486ns, 627ns, 615ns, 562ns, 570ns, 661ns, 521ns, 617ns, 358ns, 444ns, 560ns, 540ns, 471ns, 656ns, 336ns, 233ns, 209ns, 433ns, 373ns, 1.427µs, 542ns, 708ns, 288ns, 304ns, 608ns, 297ns, 252ns]
Third: [612ns, 736ns, 587ns, 525ns, 665ns, 658ns, 608ns, 614ns, 701ns, 560ns, 656ns, 395ns, 482ns, 606ns, 578ns, 510ns, 696ns, 374ns, 270ns, 246ns, 470ns, 416ns, 1.47µs, 583ns, 751ns, 327ns, 348ns, 645ns, 334ns, 289ns]
I've been trying to understand why this simple vector lookup is the bit that takes by far the most time compared to everything else and still haven't figured it out. Any help is much appreciated!
EDIT: Here is the full function which this code comes from:
pub fn spaced_pbwt(vcf: &VCFData, pbwt_cols: &Vec<SiteRow>, fm_gap: u32) -> SpacedPbwt {
let now = std::time::Instant::now();
let data_positions: Vec<u32> = vcf.positions.clone();
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut insert_positions: Vec<u32> = Vec::new();
let data: &Vec<Vec<u8>> = &vcf.vcf_data;
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in pbwt_cols {
let pos = col.position;
col_set.insert(pos);
n += 1;
}
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
let mut j: usize = 0;
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
prefixes.push(cur_prefix);
divergences.push(cur_divergence);
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in &vcf.positions {
if !col_set.contains(&col) {
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
let elapsed_second = now.elapsed();
new_add.push(val);
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
binaries.push(new_add);
is_pbwt_col.push(0);
inserted_positions.push(*col);
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
} else {
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
}
bin_values.push(cur_allele);
let st = *start_point;
if st > p {
p = st;
}
if st > q {
q = st;
}
if cur_allele == 0 {
a.push(idx_val);
d.push(p);
p = 0;
zero_tot += 1;
}
if cur_allele == 1 {
b.push(idx_val);
e.push(q);
q = 0;
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
}
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
prefixes.push(new_prefix);
divergences.push(new_divergence);
binaries.push(bin_values);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
is_pbwt_col.push(1);
pbwt_positions.push(*col);
}
j += 1;
}
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
return SpacedPbwt {
num_samples: m as u32,
num_pbwt_sites: n as u32,
num_inserted_sites: n_other as u32,
num_total_sites: n_full as u32,
pbwt_positions: pbwt_positions,
inserted_positions: inserted_positions,
all_positions: data_positions,
pbwt_col_flags: is_pbwt_col,
bin_pbwt: binaries,
count: count_vec,
occ_list: occ_vec,
fm_gap: fm_gap,
};
}
EDIT EDIT:
Here is a modified version of the file that everybody should be able to run on their machine and does exhibit the behaviour I'm concerned about. It only uses the rand
crate as a dependency:
use rand::{seq::IteratorRandom, thread_rng}; // 0.6.1
use rand::distributions::{Distribution, Uniform};
use std::collections::HashSet;
pub fn spaced_pbwt(data: &Vec<Vec<u8>>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
let m = data.len();
let n = data[0].len();
let half_n = n/2;
let mut rng = thread_rng();
let sample: Vec<u32> = (0u32..n as u32).collect();
let perm = sample.iter().choose_multiple(&mut rng, half_n);
let mut cols_to_permute: Vec<u32> = Vec::new();
for i in perm {
cols_to_permute.push(*i);
}
let mut col_set: HashSet<u32> = HashSet::new();
let mut n: usize = 0;
for col in &cols_to_permute {
col_set.insert(*col);
n += 1;
}
let m = data.len();
let n_full = data[0].len();
let n_other = n_full-n;
let mut is_pbwt_col :Vec<u8> = Vec::with_capacity(n_full+1);
let mut pbwt_positions: Vec<u32> = Vec::new();
let mut inserted_positions: Vec<u32> = Vec::new();
let mut prefixes : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut divergences : Vec<Vec<u32>> = Vec::with_capacity(n+1);
let mut binaries: Vec<Vec<u8>> = Vec::with_capacity(n_full+1);
let cur_prefix : Vec<u32> = Vec::from_iter(0..m as u32);
let cur_divergence : Vec<u32> = vec![0; m];
let mut j: usize = 0;
let mut j_pbwt = 0;
let mut count_vec: Vec<u32> = Vec::new();
let mut occ_vec : Vec<Vec<Vec<u32>>> = Vec::new();
prefixes.push(cur_prefix);
divergences.push(cur_divergence);
let mut cur_prefix_ref: &Vec<u32> = &(prefixes[prefixes.len()-1]);
let mut cur_divergence_ref: &Vec<u32> = &divergences[divergences.len()-1];
let mut ct: i32 = 0;
let mut ct_extra: u32 = 0;
let mut zero_tot: u32 = 0;
let mut one_tot: u32 = 0;
let mut occ_positions: Vec<Vec<u32>> = vec![Vec::new(),Vec::new()];
let mut new_add: Vec<u8> = Vec::with_capacity(m);
let mut a: Vec<u32> = Vec::with_capacity(m);
let mut b: Vec<u32> = Vec::with_capacity(m);
let mut d: Vec<u32> = Vec::with_capacity(m);
let mut e: Vec<u32> = Vec::with_capacity(m);
let mut bin_values: Vec<u8> = Vec::with_capacity(m);
let mut elapse1 = Vec::new();
let mut elapse2 = Vec::new();
let mut elapse3 = Vec::new();
for col in 0..n {
if !col_set.contains(&(col as u32)) {
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
occ_positions = vec![Vec::new(),Vec::new()];
new_add = Vec::with_capacity(m);
let mut val: u8;
for idx in cur_prefix_ref.iter() {
let now = std::time::Instant::now();
let elapsed_first = now.elapsed();
unsafe {
val = *data.get_unchecked(*idx as usize).get_unchecked(j);
}
let elapsed_second = now.elapsed();
new_add.push(val);
if val == 0 {
zero_tot += 1;
} else if val == 1 {
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
let elapsed_third = now.elapsed();
elapse1.push(elapsed_first);
elapse2.push(elapsed_second);
elapse3.push(elapsed_third);
}
binaries.push(new_add);
is_pbwt_col.push(0);
inserted_positions.push(col as u32);
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
} else {
a = Vec::with_capacity(m);
b = Vec::with_capacity(m);
d = Vec::with_capacity(m);
e = Vec::with_capacity(m);
bin_values = Vec::with_capacity(m);
let mut p: u32 = j_pbwt+1;
let mut q: u32 = j_pbwt+1;
occ_positions = vec![Vec::new(),Vec::new()];
ct = 0;
ct_extra = 0;
zero_tot = 0;
one_tot = 0;
let mut cur_allele: u8;
for (idx,start_point) in
cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {
let idx_val = *idx;
unsafe {
cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
}
bin_values.push(cur_allele);
let st = *start_point;
if st > p {
p = st;
}
if st > q {
q = st;
}
if cur_allele == 0 {
a.push(idx_val);
d.push(p);
p = 0;
zero_tot += 1;
}
if cur_allele == 1 {
b.push(idx_val);
e.push(q);
q = 0;
one_tot += 1;
}
if (ct == 0) || (ct_extra == fm_gap) {
occ_positions[0].push(zero_tot);
occ_positions[1].push(one_tot);
ct_extra = 0;
}
ct += 1;
ct_extra += 1;
}
let mut new_prefix = a;
new_prefix.append(&mut b);
let mut new_divergence = d;
new_divergence.append(&mut e);
prefixes.push(new_prefix);
divergences.push(new_divergence);
binaries.push(bin_values);
cur_prefix_ref = &(prefixes[prefixes.len()-1]);
cur_divergence_ref = &divergences[divergences.len()-1];
count_vec.push(zero_tot);
occ_vec.push(occ_positions);
is_pbwt_col.push(1);
pbwt_positions.push(col as u32);
}
j += 1;
}
let elapsed = now.elapsed();
println!("Calc Time: {:.4?}", elapsed);
println!("First: {:?}", &elapse1[500..530]);
println!("Second: {:?}", &elapse2[500..530]);
println!("Third: {:?}", &elapse3[500..530]);
}
fn main() {
let m = 4000;
let n = 50000;
let step: Uniform<u8> = Uniform::new(0,2);
let mut rng = rand::thread_rng();
let mut data = Vec::new();
for _ in 0..m {
let choices: Vec<u8> = step.sample_iter(&mut rng).take(n).collect();
data.push(choices);
}
let fm = 2;
spaced_pbwt(&data,fm);
}
Upvotes: 2
Views: 245
Reputation: 485
When I ran your code (on an i7-7700HQ
), I got these numbers
First: [16ns, 17ns, 16ns, 16ns, 16ns, 17ns, 15ns, 15ns, 16ns, 17ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 16ns, 15ns, 16ns, 15ns, 16ns, 16ns, 17ns, 16ns, 17ns, 16ns, 15ns, 16ns, 16ns, 16ns]
Second: [107ns, 104ns, 171ns, 109ns, 101ns, 112ns, 116ns, 169ns, 184ns, 177ns, 103ns, 108ns, 105ns, 79ns, 110ns, 112ns, 109ns, 165ns, 157ns, 104ns, 104ns, 409ns, 104ns, 107ns, 111ns, 104ns, 104ns, 104ns, 106ns, 117ns]
Third: [132ns, 126ns, 202ns, 132ns, 133ns, 140ns, 147ns, 197ns, 216ns, 207ns, 136ns, 138ns, 405ns, 105ns, 149ns, 139ns, 142ns, 198ns, 182ns, 126ns, 135ns, 434ns, 128ns, 136ns, 136ns, 127ns, 128ns, 129ns, 136ns, 147ns]
Which has vastly different proportions, than your results. Since you said, there is a C program that runs faster, it should not be a problem with your system.
The next thing I can think about is you need a cargo clean
and recompile the whole thing. Sometimes (I am on the nightly compiler) I had an issue, that made recompiled binaries slow, maybe because of some code-layout issue, compiler stuff idk. A clean build usually fixed it.
Next, you can try using link time optimization. Add this to your Cargo.toml
:
[profile.lto]
inherits = "release"
lto = true
Then run the profile with
cargo run --profile lto
Third, use a single array, like some comments said. The ndarray
crate is perfect for this. For me it brings down the times to
First: [18ns, 16ns, 17ns, 16ns, 17ns, 16ns, 18ns, 17ns, 17ns, 17ns, 17ns, 17ns, 17ns, 25ns, 16ns, 17ns, 18ns, 18ns, 17ns, 17ns, 18ns, 17ns, 17ns, 16ns, 17ns, 16ns, 16ns, 17ns, 17ns, 18ns]
Second: [51ns, 49ns, 48ns, 50ns, 51ns, 51ns, 49ns, 48ns, 48ns, 49ns, 50ns, 48ns, 53ns, 66ns, 49ns, 53ns, 52ns, 50ns, 50ns, 49ns, 53ns, 51ns, 47ns, 50ns, 52ns, 50ns, 48ns, 48ns, 48ns, 50ns]
Third: [77ns, 77ns, 75ns, 74ns, 83ns, 81ns, 75ns, 72ns, 81ns, 74ns, 82ns, 79ns, 552ns, 99ns, 81ns, 76ns, 79ns, 74ns, 77ns, 73ns, 86ns, 76ns, 75ns, 80ns, 85ns, 75ns, 74ns, 73ns, 74ns, 76ns]
use ndarray::Array2;
use std::collections::HashSet;
pub fn spaced_pbwt(data: &Array2<u8>, fm_gap: u32) -> () {
let now = std::time::Instant::now();
let (m, n) = data.dim();
let half_n = n/2;
...
unsafe {
val = *data.uget((idx as usize, j));
}
...
}
fn main() {
let m = 4000;
let n = 50000;
let step: Uniform<u8> = Uniform::new(0,2);
let mut rng = rand::thread_rng();
let mut data = Vec::new();
let mut data2 = Vec::with_capacity(m*n);
for _ in 0..m {
let choices: Vec<u8> = step.sample_iter(&mut rng).take(n).collect();
data2.extend_from_slice(&choices);
data.push(choices);
}
let fm = 2;
spaced_pbwt(&Array2::from_shape_vec((m, n), data2).unwrap(),fm);
//spaced_pbwt_vecs(&data,fm);
}
Upvotes: 1