Reputation: 1
I am looping through roughly 15,000 BioSample IDs to store their metadata in a dataframe. I have the formatting worked out for retrieving the XML data and storing it in R, but every time I run the loop it times out after about 300 records. I have tried setting the httr timeout to a very large number of seconds, but that only gets me slightly more records. Does anyone have any idea how to get around this? Should I be increasing the timeout somewhere else, in httr or curl? Unfortunately I can't use a web history object, because I have already saved my IDs by looping through entrez_link.
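For reference, this is roughly how I raised the timeout (the 600 seconds is just an arbitrary value I tried):

library(httr)
# apply a long timeout to every request rentrez makes through httr
set_config(timeout(600))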
# packages
library(rentrez)
library(XML)
library(dplyr)
# functions
# coacross(): coalesce across a tidyselect selection of columns,
# e.g. coacross(starts_with("lat"))
coacross <- function(...) {
  coalesce(!!!across(...))
}
query_soil_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "soil metagenome"[orgn:__txid410658]'
query_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "metagenome"[orgn:__txid256318]'
query_plant_rhizosphere <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "rhizosphere metagenome"[orgn:__txid939928]'
# NB: this search term is currently identical to query_plant_rhizosphere
query_plant_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "rhizosphere metagenome"[orgn:__txid939928]'
soil_meta_srch <- entrez_search(db="sra", term = query_soil_metagenome, retmax=90000, use_history = TRUE)
meta_srch <- entrez_search(db="sra", term = query_metagenome, retmax=90000)
plant_rhizo_srch <- entrez_search(db="sra", term = query_plant_rhizosphere, retmax=90000)
plant_meta_srch <- entrez_search(db="sra", term = query_plant_metagenome, retmax=90000)
all_ids <- c(soil_meta_srch$ids, meta_srch$ids, plant_rhizo_srch$ids, plant_meta_srch$ids)
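One thing I have been wondering is whether the real problem is NCBI's rate limiting rather than the timeout itself. rentrez::set_entrez_key() registers an NCBI API key, which raises the limit from 3 to 10 requests per second; the key below is just a placeholder:

# placeholder key — a real one comes from an NCBI account
set_entrez_key("MY_NCBI_API_KEY")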
# link each SRA ID to its BioSample ID
all <- c()
for (i in all_ids[1:1000]) {
  temp <- entrez_link(dbfrom = 'sra', id = i, db = 'biosample', by_id = TRUE)
  all <- append(all, temp$links$sra_biosample)
}
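Since a single failed request kills the whole loop, I also sketched a retry wrapper with backoff (fetch_links_with_retry is just a name I made up, not a rentrez function):

fetch_links_with_retry <- function(id, tries = 3) {
  for (attempt in seq_len(tries)) {
    result <- tryCatch(
      entrez_link(dbfrom = "sra", id = id, db = "biosample", by_id = TRUE),
      error = function(e) NULL
    )
    if (!is.null(result)) return(result)
    Sys.sleep(2^attempt)  # back off before retrying: 2 s, 4 s, 8 s
  }
  NULL  # give up on this ID after all attempts
}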
records <- data.frame()
for (i in all) {
  temp <- entrez_fetch(db = "biosample", id = i, rettype = "xml")
  temp_xml <- xmlToList(temp)
  # unlist the BioSample attributes and store them in a one-row data frame
  unnest <- lapply(temp_xml$BioSample$Attributes, function(x) do.call(rbind, x))
  temp_df_unnested <- data.frame(matrix(NA, nrow = 1, ncol = length(unnest)))
  for (j in seq_along(unnest)) {  # j, not i, so the outer loop variable is not clobbered
    df <- data.frame(unnest[j])
    temp_df_unnested[, j] <- df[1, 1]          # attribute value
    colnames(temp_df_unnested)[j] <- df[2, 1]  # attribute name
  }
  # lowercase the column names and replace every space with an underscore
  colnames(temp_df_unnested) <- tolower(colnames(temp_df_unnested))
  colnames(temp_df_unnested) <- gsub(" ", "_", colnames(temp_df_unnested))
  # attach ID, accession_sra and access to the same row as the attributes,
  # so each record's fields stay aligned when the rows are appended
  temp_df_unnested$ID <- i
  temp_df_unnested$accession_sra <- temp_xml$BioSample$.attrs["accession"]
  temp_df_unnested$access <- temp_xml$BioSample$.attrs["access"]
  records <- dplyr::bind_rows(records, temp_df_unnested)
}
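Another idea, since entrez_fetch() accepts a vector of IDs, is to fetch in batches so far fewer requests are issued. An untested sketch (the batch size of 100 is arbitrary):

batch_size <- 100
chunks <- split(all, ceiling(seq_along(all) / batch_size))
xml_batches <- lapply(chunks, function(ids) {
  Sys.sleep(0.34)  # pause between batch requests
  entrez_fetch(db = "biosample", id = ids, rettype = "xml")
})

The parsing above would then need to loop over the multiple BioSample elements inside each returned document rather than assuming one per fetch.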
# collapse duplicate metadata columns (variant spellings of the same field) into single columns
combine_cols <- records %>%
transmute(ena_first_public = coacross(starts_with("ena-first-public")),
ena_last_update = coacross(contains("update")),
lat = coacross(contains("latitude")),
lon = coacross(contains("longitude")))
combine_cols$ID <- records$ID
records <- left_join(records, combine_cols, by = "ID") %>%
select(-contains("ena-first-public"),
-contains("update"),
-lat_lon)
I am expecting the loop to run through my full list of accession IDs and return the BioSample metadata for each one in a dataframe.
Upvotes: 0
Views: 138