sahuno
sahuno

Reputation: 483

grep not giving specific results in R

I want to keep only the paths that match certain values ("chr1" "chr11" "chr16" "chr17" "chr2" "chr5" "chr6" "chr7") from a list of paths. However, my results include additional chromosomes (e.g. chr10, chr12) that I did not ask for.

These are the items I want to filter on

> sort(chrm_to_filter$chr)
 "chr1"  "chr11" "chr16" "chr17" "chr2"  "chr5"  "chr6"  "chr7" 

My data looks something like this

print(path_per_chr_tabix)
"/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr10.bgz"
"/path_to_file/merged_modified_per_base_calling.chr11.bgz"
"/path_to_file/merged_modified_per_base_calling.chr12.bgz"
"/path_to_file/merged_modified_per_base_calling.chr13.bgz"
"/path_to_file/merged_modified_per_base_calling.chr14.bgz"
"/path_to_file/merged_modified_per_base_calling.chr15.bgz"
"/path_to_file/merged_modified_per_base_calling.chr16.bgz"
"/path_to_file/merged_modified_per_base_calling.chr17.bgz"
"/path_to_file/merged_modified_per_base_calling.chr18.bgz"
"/path_to_file/merged_modified_per_base_calling.chr19.bgz"
"/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr3.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr4.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr7.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr8.bgz" 
"/path_to_file/merged_modified_per_base_calling.chr9.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrm.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrX.bgz" 
"/path_to_file/merged_modified_per_base_calling.chrY.bgz" 

# Decide which per-chromosome tabix files to load (loading fewer saves memory).
# The requested chromosome names are sorted and joined with "|" into a single
# regular expression, and every path matching that expression is kept.
# NOTE: the alternation is unanchored, so "chr1" also matches "chr10".."chr19";
# that is why the result printed below contains more files than requested.
subset_tabix_paths_to_load <- path_per_chr_tabix[
  grepl(
    pattern = paste0(sort(chrm_to_filter$chr), collapse = "|"),
    x = path_per_chr_tabix
  )
]

# Report the selection.
message("these are the files we will be workign with for now- ")
print(subset_tabix_paths_to_load)

"/paths/merged_modified_per_base_calling.chr1.bgz" 
"/paths/merged_modified_per_base_calling.chr10.bgz"
"/paths/merged_modified_per_base_calling.chr11.bgz"
"/paths/merged_modified_per_base_calling.chr12.bgz"
"/paths/merged_modified_per_base_calling.chr13.bgz"
"/paths/merged_modified_per_base_calling.chr14.bgz"
"/paths/merged_modified_per_base_calling.chr15.bgz"
"/paths/merged_modified_per_base_calling.chr16.bgz"
"/paths/merged_modified_per_base_calling.chr17.bgz"
"/paths/merged_modified_per_base_calling.chr18.bgz"
"/paths/merged_modified_per_base_calling.chr19.bgz"
"/paths/merged_modified_per_base_calling.chr2.bgz" 
"/paths/merged_modified_per_base_calling.chr5.bgz" 
"/paths/merged_modified_per_base_calling.chr6.bgz" 
"/paths/merged_modified_per_base_calling.chr7.bgz" 

Upvotes: 1

Views: 79

Answers (3)

Abdur Rohman
Abdur Rohman

Reputation: 2944

# For each requested chromosome (in sorted order), keep the paths that contain
# the literal text "<chr>.". With fixed = TRUE the "." is not a regex wildcard,
# so "chr1." cannot match "chr10.". The per-chromosome results are then
# flattened into a single character vector.
unlist(
  lapply(
    sort(chrm_to_filter$chr),
    \(chrom) {
      is_hit <- grepl(paste0(chrom, "."), path_per_chr_tabix, fixed = TRUE)
      path_per_chr_tabix[is_hit]
    }
  )
)

#[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
#[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
#[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
#[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
#[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
#[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
#[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
#[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz" 

Upvotes: 1

Jilber Urbina
Jilber Urbina

Reputation: 61154

Here's an approach using sub + which

> filter_chr <- c("chr1",  "chr11", "chr16", "chr17", "chr2",  "chr5",  "chr6",  "chr7") 
> # sub() reduces each path to the chromosome label found between ".chr" and
> # the next "."; %in% then does an exact comparison against filter_chr, so
> # "chr1" is never confused with "chr10". The capture uses "[^.]+" rather
> # than "\\d+" so that non-numeric labels (chrX, chrY, chrm) are extracted
> # too and can be selected simply by adding them to filter_chr.
> string[which(sub(".*\\.(chr[^.]+)\\..*$", "\\1", string) %in% filter_chr)]
[1] "/path_to_file/merged_modified_per_base_calling.chr1.bgz" 
[2] "/path_to_file/merged_modified_per_base_calling.chr11.bgz"
[3] "/path_to_file/merged_modified_per_base_calling.chr16.bgz"
[4] "/path_to_file/merged_modified_per_base_calling.chr17.bgz"
[5] "/path_to_file/merged_modified_per_base_calling.chr2.bgz" 
[6] "/path_to_file/merged_modified_per_base_calling.chr5.bgz" 
[7] "/path_to_file/merged_modified_per_base_calling.chr6.bgz" 
[8] "/path_to_file/merged_modified_per_base_calling.chr7.bgz" 

string has the following structure:

# Character vector of the 22 per-chromosome file paths referred to as `string`
# in the call above (chr1-chr19, chrm, chrX and chrY).
c("/path_to_file/merged_modified_per_base_calling.chr1.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr10.bgz", "/path_to_file/merged_modified_per_base_calling.chr11.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr12.bgz", "/path_to_file/merged_modified_per_base_calling.chr13.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr14.bgz", "/path_to_file/merged_modified_per_base_calling.chr15.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr16.bgz", "/path_to_file/merged_modified_per_base_calling.chr17.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr18.bgz", "/path_to_file/merged_modified_per_base_calling.chr19.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr2.bgz", "/path_to_file/merged_modified_per_base_calling.chr3.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr4.bgz", "/path_to_file/merged_modified_per_base_calling.chr5.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr6.bgz", "/path_to_file/merged_modified_per_base_calling.chr7.bgz", 
"/path_to_file/merged_modified_per_base_calling.chr8.bgz", "/path_to_file/merged_modified_per_base_calling.chr9.bgz", 
"/path_to_file/merged_modified_per_base_calling.chrm.bgz", "/path_to_file/merged_modified_per_base_calling.chrX.bgz", 
"/path_to_file/merged_modified_per_base_calling.chrY.bgz")

Upvotes: 2

Phenomniverse
Phenomniverse

Reputation: 329

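The problem is that one of your search strings is 'chr1', which is a substring of 'chr1', 'chr10', 'chr11', etc., so grep matches all of those paths. If you change your search strings to include the file suffix ('chr1.bgz' etc.), it should work. Ideally escape the dot as well, e.g. 'chr1\\.bgz', because an unescaped '.' in a regular expression matches any character.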

Upvotes: 4

Related Questions