Getting element from a `Vec<Vec<u8>>` very slow

Question

I'm running the following snippet of code (the `std::time` objects are just there for benchmarking). It reads `u8` elements from a vector of vectors of `u8` (a `Vec<Vec<u8>>`) in a given order and creates a new vector containing these elements in that order.

// Visit the rows of `data` in the order given by `cur_prefix_ref`, reading column `j` of each.
for idx in cur_prefix_ref.iter() {

     // Per-iteration timer; every `elapsed()` below is measured from this instant.
     let now = std::time::Instant::now();

     // Taken immediately after starting the timer, so it only captures the cost of the clock calls.
     let elapsed_first = now.elapsed();

     unsafe {
         // Unchecked double index: row `*idx`, then column `j` within that row.
         val = *data.get_unchecked(*idx as usize).get_unchecked(j);
     }

     // Cumulative time since `now` (clock overhead + the lookup), not the lookup in isolation.
     let elapsed_second = now.elapsed();

     new_add.push(val);

    // Running totals of zeros and ones seen so far; any other value is counted in neither.
    if val == 0 {
        zero_tot += 1;
    } else if val == 1 {
        one_tot += 1;
    }

    // Snapshot both running totals on the first element and whenever `ct_extra` reaches `fm_gap`.
    if (ct == 0) || (ct_extra == fm_gap) {
        occ_positions[0].push(zero_tot);
        occ_positions[1].push(one_tot);
        ct_extra = 0;
   
    }

    ct += 1;
    ct_extra += 1;

    // Cumulative time since `now` for the whole iteration body up to this point.
    let elapsed_third = now.elapsed();

    // Three more pushes per iteration, performed after the last reading so they are not timed.
    elapse1.push(elapsed_first);
    elapse2.push(elapsed_second);
    elapse3.push(elapsed_third);
}

In my full code this inner loop ends up running hundreds of millions of times, so I'm trying to optimise it as much as possible. According to my benchmarking, I seem to be spending the vast majority of the loop time looking up values from my `Vec<Vec<u8>>`, on the line `val = *data.get_unchecked(*idx as usize).get_unchecked(j);`. See below for the `elapsed_first`, `elapsed_second` and `elapsed_third` times from different iterations of this loop (the i-th element of each list is from the same iteration):

First: [27ns, 23ns, 21ns, 24ns, 27ns, 23ns, 28ns, 23ns, 26ns, 23ns, 21ns, 22ns, 27ns, 27ns, 28ns, 23ns, 25ns, 24ns, 26ns, 25ns, 22ns, 24ns, 24ns, 28ns, 28ns, 28ns, 26ns, 22ns, 22ns, 21ns]

Second: [538ns, 695ns, 550ns, 486ns, 627ns, 615ns, 562ns, 570ns, 661ns, 521ns, 617ns, 358ns, 444ns, 560ns, 540ns, 471ns, 656ns, 336ns, 233ns, 209ns, 433ns, 373ns, 1.427µs, 542ns, 708ns, 288ns, 304ns, 608ns, 297ns, 252ns]

Third: [612ns, 736ns, 587ns, 525ns, 665ns, 658ns, 608ns, 614ns, 701ns, 560ns, 656ns, 395ns, 482ns, 606ns, 578ns, 510ns, 696ns, 374ns, 270ns, 246ns, 470ns, 416ns, 1.47µs, 583ns, 751ns, 327ns, 348ns, 645ns, 334ns, 289ns]

I've been trying to understand why this simple vector lookup is the bit that takes by far the most time compared to everything else and still haven't figured it out. Any help is much appreciated!

EDIT: Here is the full function which this code comes from:

/// Sweeps every column listed in `vcf.positions` and builds a `SpacedPbwt`.
///
/// Columns whose position matches a `position` field of an entry in `pbwt_cols` are
/// treated as PBWT columns: the current prefix order is re-partitioned (rows whose value
/// is 0 first, then rows whose value is 1) and a matching divergence vector is built.
/// All other columns are only read out in the current prefix order. For every column the
/// function records the column values in prefix order, the number of zeros, and running
/// zero/one counts sampled on the first row and whenever `ct_extra` reaches `fm_gap`.
///
/// The `elapse*` vectors and the `println!` calls are benchmarking instrumentation.
///
/// NOTE(review): the generic arguments in this listing look stripped (`&Vec`, `Vec>`,
/// `HashSet`), presumably by HTML extraction; they must be restored before this compiles.
pub fn spaced_pbwt(vcf: &VCFData, pbwt_cols: &Vec, fm_gap: u32) -> SpacedPbwt {

    // Wall-clock timer for the whole function.
    let now = std::time::Instant::now();

    let data_positions: Vec = vcf.positions.clone();
    // NOTE(review): `pbwt_positions` is declared again further down, shadowing this binding,
    // and `insert_positions` is never used in this function.
    let mut pbwt_positions: Vec = Vec::new();
    let mut insert_positions: Vec = Vec::new();
    let data: &Vec> = &vcf.vcf_data;

    // Set of the positions of the PBWT columns, used for membership tests in the main loop.
    let mut col_set: HashSet = HashSet::new();

    // Counts the entries of `pbwt_cols`.
    let mut n: usize = 0;

    for col in pbwt_cols {
        let pos = col.position;
        col_set.insert(pos);
        n += 1;
    }

    // `m` is the number of rows; `n_full` is the length of the first row.
    // Indexing `data[0]` panics if `data` is empty.
    let m = data.len();
    let n_full = data[0].len();

    let n_other = n_full-n;

    let mut is_pbwt_col :Vec = Vec::with_capacity(n_full+1);
    let mut pbwt_positions: Vec = Vec::new();
    let mut inserted_positions: Vec = Vec::new();
    let mut prefixes : Vec> = Vec::with_capacity(n+1);
    let mut divergences : Vec> = Vec::with_capacity(n+1);
    let mut binaries: Vec> = Vec::with_capacity(n_full+1);


    // Initial state: rows in their natural order 0..m, all divergences 0.
    let cur_prefix : Vec = Vec::from_iter(0..m as u32);
    let cur_divergence : Vec = vec![0; m];
    // `j` is the index of the column currently being processed.
    let mut j: usize = 0;
    // NOTE(review): `j_pbwt` is never modified in this function, so `p` and `q` below
    // always start at 1 — confirm whether it should advance once per PBWT column.
    let mut j_pbwt = 0;

    let mut count_vec: Vec = Vec::new();
    let mut occ_vec : Vec>> = Vec::new();

    prefixes.push(cur_prefix);
    divergences.push(cur_divergence);

    // References to the most recently pushed prefix / divergence vectors.
    let mut cur_prefix_ref: &Vec = &(prefixes[prefixes.len()-1]);
    let mut cur_divergence_ref: &Vec = &divergences[divergences.len()-1];

    // Per-column scratch state. Every value below is reset or replaced at the start of
    // each column, so these initial values and allocations are never observed.
    let mut ct: i32 = 0;
    let mut ct_extra: u32 = 0;
    let mut zero_tot: u32 = 0;
    let mut one_tot: u32 = 0;
    let mut occ_positions: Vec> = vec![Vec::new(),Vec::new()];
    let mut new_add: Vec = Vec::with_capacity(m);

    let mut a: Vec = Vec::with_capacity(m);
    let mut b: Vec = Vec::with_capacity(m);
    let mut d: Vec = Vec::with_capacity(m);
    let mut e: Vec = Vec::with_capacity(m);

    let mut bin_values: Vec = Vec::with_capacity(m);

    // Benchmark samples: one entry per inner-loop iteration of the non-PBWT branch.
    let mut elapse1 = Vec::new();
    let mut elapse2 = Vec::new();
    let mut elapse3 = Vec::new();

    for col in &vcf.positions {
        if !col_set.contains(&col) {
            // Non-PBWT column: read the column in the current prefix order, order unchanged.


            ct = 0;
            ct_extra = 0;
            zero_tot = 0;
            one_tot = 0;
            occ_positions = vec![Vec::new(),Vec::new()];

            new_add = Vec::with_capacity(m);

            let mut val: u8;
            for idx in cur_prefix_ref.iter() {

                // Per-iteration timer; the three readings below are cumulative from here.
                let now = std::time::Instant::now();

                let elapsed_first = now.elapsed();

                // No SAFETY argument is given in the original: this needs `*idx` to be a
                // valid row index and row `*idx` to have more than `j` elements.
                unsafe {
                val = *data.get_unchecked(*idx as usize).get_unchecked(j);
                }

                let elapsed_second = now.elapsed();

                new_add.push(val);

                // Values other than 0 and 1 are stored but counted in neither total.
                if val == 0 {
                    zero_tot += 1;
                } else if val == 1 {
                    one_tot += 1;
                }

                // Snapshot the running totals on the first row and whenever `ct_extra`
                // reaches `fm_gap`.
                if (ct == 0) || (ct_extra == fm_gap) {
                    occ_positions[0].push(zero_tot);
                    occ_positions[1].push(one_tot);
                    ct_extra = 0;
    
                }

                ct += 1;
                ct_extra += 1;

                let elapsed_third = now.elapsed();

                elapse1.push(elapsed_first);
                elapse2.push(elapsed_second);
                elapse3.push(elapsed_third);
            }

            binaries.push(new_add);
            is_pbwt_col.push(0);
            inserted_positions.push(*col);
            count_vec.push(zero_tot);
            occ_vec.push(occ_positions);


        } else {
            // PBWT column: additionally build the next prefix order and divergence vector.


            // `a`/`d` collect row indices and divergences for value 0; `b`/`e` for value 1.
            a = Vec::with_capacity(m);
            b = Vec::with_capacity(m);
            d = Vec::with_capacity(m);
            e = Vec::with_capacity(m);

            bin_values = Vec::with_capacity(m);

            let mut p: u32 = j_pbwt+1;
            let mut q: u32 = j_pbwt+1;

            occ_positions = vec![Vec::new(),Vec::new()];

            ct = 0;
            ct_extra = 0;
            zero_tot = 0;
            one_tot = 0;

            let mut cur_allele: u8;
            for (idx,start_point) in
            cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {

                let idx_val = *idx;

                // Same unchecked access as in the other branch; same unstated preconditions.
                unsafe {
                cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
                }


                bin_values.push(cur_allele);

                let st = *start_point;

                // `p` and `q` track the largest divergence seen since the last 0 / last 1.
                if st > p {
                p = st;
                }

                if st > q {
                q = st;
                }

                if cur_allele == 0 {
                    a.push(idx_val);
                    d.push(p);
                    p = 0;

                    zero_tot += 1;
                }
    
                if cur_allele == 1 {
                    b.push(idx_val);
                    e.push(q);
                    q = 0;
    
                    one_tot += 1;
                }

                if (ct == 0) || (ct_extra == fm_gap) {
                    occ_positions[0].push(zero_tot);
                    occ_positions[1].push(one_tot);
                    ct_extra = 0;
    
                }
                ct += 1;
                ct_extra += 1;
            
            }  

            
            // New order: all rows with value 0 (in their previous relative order), then all
            // rows with value 1. Rows with any other value are in neither list and drop out.
            let mut new_prefix = a;
            new_prefix.append(&mut b);
            let mut new_divergence = d;
            new_divergence.append(&mut e);


            prefixes.push(new_prefix);
            divergences.push(new_divergence);
            binaries.push(bin_values);

            // Re-point the references at the vectors that were just pushed.
            cur_prefix_ref = &(prefixes[prefixes.len()-1]);
            cur_divergence_ref = &divergences[divergences.len()-1];

            count_vec.push(zero_tot);
            occ_vec.push(occ_positions);

            is_pbwt_col.push(1);
            pbwt_positions.push(*col);

        }
        j += 1;

    }

    let elapsed = now.elapsed();

    println!("Calc Time: {:.4?}", elapsed);

    // These slices panic if fewer than 530 timing samples were collected.
    println!("First: {:?}", &elapse1[500..530]);
    println!("Second: {:?}", &elapse2[500..530]);
    println!("Third: {:?}", &elapse3[500..530]);


    // `prefixes` and `divergences` are not part of the returned value.
    return SpacedPbwt {
        num_samples: m as u32,
        num_pbwt_sites: n as u32,
        num_inserted_sites: n_other as u32,
        num_total_sites: n_full as u32,

        pbwt_positions: pbwt_positions,
        inserted_positions: inserted_positions,
        all_positions: data_positions,
        pbwt_col_flags: is_pbwt_col,

        bin_pbwt: binaries,
        count: count_vec,
        occ_list: occ_vec,
        fm_gap: fm_gap,
        
    };
}

EDIT EDIT:

Here is a modified version of the file that everybody should be able to run on their machine and does exhibit the behaviour I'm concerned about. It only uses the rand crate as a dependency:

use rand::{seq::IteratorRandom, thread_rng}; // 0.6.1
use rand::distributions::{Distribution, Uniform};

use std::collections::HashSet;

pub fn spaced_pbwt(data: &Vec>, fm_gap: u32) -> () {

    let now = std::time::Instant::now();


    let m = data.len();
    let n = data[0].len();
    let half_n = n/2;
    
    let mut rng = thread_rng();
    let sample: Vec = (0u32..n as u32).collect();
    let perm = sample.iter().choose_multiple(&mut rng, half_n);

    let mut cols_to_permute: Vec = Vec::new();

    for i in perm {
        cols_to_permute.push(*i);
    }
    

    let mut col_set: HashSet = HashSet::new();

    let mut n: usize = 0;

    for col in &cols_to_permute {
        col_set.insert(*col);
        n += 1;
    }

    let m = data.len();
    let n_full = data[0].len();

    let n_other = n_full-n;

    let mut is_pbwt_col :Vec = Vec::with_capacity(n_full+1);
    let mut pbwt_positions: Vec = Vec::new();
    let mut inserted_positions: Vec = Vec::new();
    let mut prefixes : Vec> = Vec::with_capacity(n+1);
    let mut divergences : Vec> = Vec::with_capacity(n+1);
    let mut binaries: Vec> = Vec::with_capacity(n_full+1);


    let cur_prefix : Vec = Vec::from_iter(0..m as u32);
    let cur_divergence : Vec = vec![0; m];
    let mut j: usize = 0;
    let mut j_pbwt = 0;

    let mut count_vec: Vec = Vec::new();
    let mut occ_vec : Vec>> = Vec::new();

    prefixes.push(cur_prefix);
    divergences.push(cur_divergence);

    let mut cur_prefix_ref: &Vec = &(prefixes[prefixes.len()-1]);
    let mut cur_divergence_ref: &Vec = &divergences[divergences.len()-1];

    let mut ct: i32 = 0;
    let mut ct_extra: u32 = 0;
    let mut zero_tot: u32 = 0;
    let mut one_tot: u32 = 0;
    let mut occ_positions: Vec> = vec![Vec::new(),Vec::new()];
    let mut new_add: Vec = Vec::with_capacity(m);

    let mut a: Vec = Vec::with_capacity(m);
    let mut b: Vec = Vec::with_capacity(m);
    let mut d: Vec = Vec::with_capacity(m);
    let mut e: Vec = Vec::with_capacity(m);

    let mut bin_values: Vec = Vec::with_capacity(m);

    let mut elapse1 = Vec::new();
    let mut elapse2 = Vec::new();
    let mut elapse3 = Vec::new();

    for col in 0..n {
        if !col_set.contains(&(col as u32)) {


            ct = 0;
            ct_extra = 0;
            zero_tot = 0;
            one_tot = 0;
            occ_positions = vec![Vec::new(),Vec::new()];

            new_add = Vec::with_capacity(m);

            let mut val: u8;
            for idx in cur_prefix_ref.iter() {

                let now = std::time::Instant::now();

                let elapsed_first = now.elapsed();

                unsafe {
                val = *data.get_unchecked(*idx as usize).get_unchecked(j);
                }

                let elapsed_second = now.elapsed();

                new_add.push(val);

                if val == 0 {
                    zero_tot += 1;
                } else if val == 1 {
                    one_tot += 1;
                }

                if (ct == 0) || (ct_extra == fm_gap) {
                    occ_positions[0].push(zero_tot);
                    occ_positions[1].push(one_tot);
                    ct_extra = 0;
    
                }

                ct += 1;
                ct_extra += 1;

                let elapsed_third = now.elapsed();

                elapse1.push(elapsed_first);
                elapse2.push(elapsed_second);
                elapse3.push(elapsed_third);
            }

            binaries.push(new_add);
            is_pbwt_col.push(0);
            inserted_positions.push(col as u32);
            count_vec.push(zero_tot);
            occ_vec.push(occ_positions);


        } else {


            a = Vec::with_capacity(m);
            b = Vec::with_capacity(m);
            d = Vec::with_capacity(m);
            e = Vec::with_capacity(m);

            bin_values = Vec::with_capacity(m);

            let mut p: u32 = j_pbwt+1;
            let mut q: u32 = j_pbwt+1;

            occ_positions = vec![Vec::new(),Vec::new()];

            ct = 0;
            ct_extra = 0;
            zero_tot = 0;
            one_tot = 0;

            let mut cur_allele: u8;
            for (idx,start_point) in
            cur_prefix_ref.iter().zip(cur_divergence_ref.iter()) {

                let idx_val = *idx;

                unsafe {
                cur_allele = *data.get_unchecked(idx_val as usize).get_unchecked(j);
                }


                bin_values.push(cur_allele);

                let st = *start_point;

                if st > p {
                p = st;
                }

                if st > q {
                q = st;
                }

                if cur_allele == 0 {
                    a.push(idx_val);
                    d.push(p);
                    p = 0;

                    zero_tot += 1;
                }
    
                if cur_allele == 1 {
                    b.push(idx_val);
                    e.push(q);
                    q = 0;
    
                    one_tot += 1;
                }

                if (ct == 0) || (ct_extra == fm_gap) {
                    occ_positions[0].push(zero_tot);
                    occ_positions[1].push(one_tot);
                    ct_extra = 0;
    
                }
                ct += 1;
                ct_extra += 1;
            
            }  

            
            let mut new_prefix = a;
            new_prefix.append(&mut b);
            let mut new_divergence = d;
            new_divergence.append(&mut e);


            prefixes.push(new_prefix);
            divergences.push(new_divergence);
            binaries.push(bin_values);

            cur_prefix_ref = &(prefixes[prefixes.len()-1]);
            cur_divergence_ref = &divergences[divergences.len()-1];

            count_vec.push(zero_tot);
            occ_vec.push(occ_positions);

            is_pbwt_col.push(1);
            pbwt_positions.push(col as u32);

        }
        j += 1;

    }

    let elapsed = now.elapsed();

    println!("Calc Time: {:.4?}", elapsed);

    println!("First: {:?}", &elapse1[500..530]);
    println!("Second: {:?}", &elapse2[500..530]);
    println!("Third: {:?}", &elapse3[500..530]);

}


fn main() {

    let m = 4000;
    let n = 50000;

    let step: Uniform = Uniform::new(0,2);
    let mut rng = rand::thread_rng();
    let mut data = Vec::new();

    for _ in 0..m {
        let choices: Vec = step.sample_iter(&mut rng).take(n).collect();
        data.push(choices);
    }
    
    let fm = 2;
    spaced_pbwt(&data,fm);
}

Getting element from a `Vec<Vec<u8>>` very slow

Answers (1)

Related Questions

Getting element from a `Vec<Vec<u8>>` very slow

Answers (1)

Related Questions

Getting element from a `Vec<Vec<u8>>` very slow