Dylan Stephens
Dylan Stephens

Reputation: 1

NCBI Rentrez timeout HTTP failure 400

I am running a loop through Biosample IDs to store their metadata in a dataframe for about 15,000 IDs. I have the formatting for how I want to retrieve and store the XML data into an R dataframe but each time that I try to run the loop it times out after about 300 records. I have tried changing the httr timeout settings to a really large number of seconds but it only increases the amount of records slightly. Does anyone have any idea how to get around this? Should I increase the timeout elsewhere in http or curl? I unfortunately can't use a webhistory link since I have already saved my IDs through running a loop through entrez link.

library(rentrez)
library(XML)# packages
library(dplyr)

# functions
coacross <- function(...) {
  coalesce(!!!across(...))
}

query_soil_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "soil metagenome"[orgn:__txid410658]'
query_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "metagenome"[orgn:__txid256318]'
query_plant_rhizosphere <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "rhizosphere metagenome"[orgn:__txid939928]'
query_plant_metagenome <- '((((((("soil") AND "wgs"[Strategy]) NOT "amplicon"[Strategy]) AND "illumina"[Platform])) NOT amplicon)) AND "rhizosphere metagenome"[orgn:__txid939928]'

soil_meta_srch <- entrez_search(db="sra", term = query_soil_metagenome, retmax=90000, use_history = T)

meta_srch <- entrez_search(db="sra", term = query_metagenome, retmax=90000)

plant_rhizo_srch <- entrez_search(db="sra", term = query_plant_rhizosphere, retmax=90000)

plant_meta_srch <- entrez_search(db="sra", term = query_plant_metagenome, retmax=90000)

all_ids <- c(soil_meta_srch$ids, meta_srch$ids, plant_rhizo_srch$ids, plant_meta_srch$ids)

# for loop for linking
all = c()
for (i in all_ids[1:1000]){
  temp = entrez_link(dbfrom='sra', id = i, db='biosample', by_id = T)
  temp = temp$links$sra_biosample
  all <- append(all, temp)
}


records = data.frame(ID = NA, accession_sra = NA, access = NA)
c = 1
for (i in all){
  temp = entrez_fetch(db="biosample", id = i, rettype = "xml")
  temp_2 <- read_xml(temp)
  temp_xml = xmlToList(temp)
  
  ID <- i
  accession_sra <- temp_xml$BioSample$.attrs["accession"]
  access <- temp_xml$BioSample$.attrs["access"]

  records[c, "ID"] = ID
  records[c, "accession_sra"] = accession_sra
  records[c, "access"] = access
  
  # unlist and store in df attributes
  unnest <- lapply(temp_xml$BioSample$Attributes, function(x) do.call(rbind, x))
  temp_df_unnested <- data.frame(matrix(NA,    # Create empty data frame
                          nrow = 1,
                          ncol = length(unnest)))

  for (i in 1:length(unnest)){
    df <- data.frame(unnest[i])
    store <- df[1,1]
    col_info <- df[2,1]
    colnames(temp_df_unnested)[i] <- col_info
    temp_df_unnested[,i] <- store
  }

  # take all words and make them lowercase and add a hyphen instead of a space
  colnames(temp_df_unnested) <- tolower(colnames(temp_df_unnested))
  colnames(temp_df_unnested) <- sub(" ", "_", colnames(temp_df_unnested))


  # access, ID and accession_sra work the rest under attributes
#  records <- merge(records, temp_df_unnested, by = 0)
  records <- dplyr::bind_rows(records, temp_df_unnested)

  
  c = c + 1
}

combine_cols <- records %>%
  transmute(ena_first_public = coacross(starts_with("ena-first-public")),
            ena_last_update = coacross(contains("update")),
            lat = coacross(contains("latitude")),
            lon = coacross(contains("longitude")))

combine_cols$ID <- records$ID

records <- left_join(records, combine_cols, by = "ID") %>%
  select(-contains("ena-first-public"),
         -contains("update"),
         -lat_lon)

I am expecting to run through my loop my list of accession ID numbers and retrieve biosample information in a dataframe format.

Upvotes: 0

Views: 138

Answers (0)

Related Questions