Reputation: 61
With this code I am able to get the data from the first page of this website, but I want to get the data from the complete site — that is, extract the data from all of its pages. After extraction, the data should be saved to either an Excel or a CSV file.
# Scrape GeM bid-result listings from every results page and save to one CSV.
#
# Install the packages once, outside the script, rather than on every run:
#   install.packages(c("rvest", "dplyr"))
library(rvest)
library(dplyr)

# Scrape a single results page and return one data frame of bids.
#
# @param page_no 1-based page index substituted into the `page_no` query
#   parameter (the original code pasted the literal string "i" into the URL,
#   which never fetched anything past an invalid page).
# @return data.frame with one row per bid: bid_no, status, start_date,
#   end_date, items, quantity, department_name_and_address, is_ra.
scrape_bid_page <- function(page_no) {
  url <- paste0(
    "https://bidplus.gem.gov.in/bidresultlists?bidresultlists&page_no=",
    page_no
  )
  pg <- read_html(url)

  blocks <- html_nodes(pg, ".block")

  # Item name and quantity live in the same 'col-block' div.
  items_and_quantity <- html_nodes(
    blocks, xpath = ".//div[@class='col-block' and contains(., 'Item(s)')]"
  )
  items <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Item(s)')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  quantity <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Quantity')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE) %>%
    as.numeric()

  # Collapse the multi-line address into a single '|'-separated string.
  department_name_and_address <- html_nodes(
    blocks,
    xpath = ".//div[@class='col-block' and contains(., 'Department Name And Address')]"
  ) %>%
    html_text(trim = TRUE) %>%
    gsub("\n", "|", .) %>%
    gsub("[[:space:]]*\\||\\|[[:space:]]*", "|", .)

  block_header <- html_nodes(blocks, "div.block_header")
  # Strip the leading "Bid No.: " label, keeping only the identifier.
  bid_no <- html_nodes(block_header, xpath = ".//p[contains(@class, 'bid_no')]") %>%
    html_text(trim = TRUE) %>%
    gsub("^.*: ", "", .)
  status <- html_nodes(
    block_header, xpath = ".//p/b[contains(., 'Status')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  start_date <- html_nodes(
    blocks, xpath = ".//strong[contains(., 'Start Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  end_date <- html_nodes(
    blocks, xpath = ".//strong[contains(., 'End Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  xdf <- data.frame(
    bid_no,
    status,
    start_date,
    end_date,
    items,
    quantity,
    department_name_and_address,
    stringsAsFactors = FALSE
  )
  # Reverse auctions carry "/RA/" in the bid number.
  xdf$is_ra <- grepl("/RA/", bid_no)
  xdf
}

# Scrape every page, combine, and write ONE csv at the end.
# NOTE: write.csv() ignores append = TRUE (it warns and always writes a
# fresh file), so building the full data frame first and writing once is
# the correct pattern — not repeated appends per page.
n_pages <- 10  # TODO: set to the real page count shown in the site's pager
all_pages <- lapply(seq_len(n_pages), scrape_bid_page)
xdf <- bind_rows(all_pages)

str(xdf)
write.csv(xdf, "xdf.csv", row.names = FALSE)
Upvotes: 0
Views: 130
Reputation: 2707
Try this: first find the total number of pages from the pagination widget, wrap your scraping code in a function, then map that function over every page number and combine the results.
library(rvest)
library(tidyverse)
# Read the first results page, then derive the total number of result pages
# from the pagination widget.
pg <- read_html("https://bidplus.gem.gov.in/bidresultlists?bidresultlists&page_no=1")

## Find total number of pages
pager_links <- pg %>%
  html_nodes(".pagination li a")
# NOTE(review): assumes the 5th pager link always points at the last page —
# confirm against the live markup before relying on this.
page_num <- pager_links[5] %>%
  html_attrs() %>%
  unlist() %>%
  parse_number() %>%
  unique()
#make function for scraping page
# Scrape one GeM bid-result page and return its bids as a data frame.
#
# @param i 1-based page number substituted into the `page_no` query parameter.
# @return A data frame with one row per bid: bid_no, status, start_date,
#   end_date, items, quantity, department_name_and_address, is_ra.
scr <- function(i) {
  pg <- read_html(paste0(
    "https://bidplus.gem.gov.in/bidresultlists?bidresultlists&page_no=", i
  ))
  blocks <- html_nodes(pg, ".block")

  # Item name and quantity live in the same 'col-block' div.
  items_and_quantity <- html_nodes(
    blocks, xpath = ".//div[@class='col-block' and contains(., 'Item(s)')]"
  )
  items <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Item(s)')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  quantity <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Quantity')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE) %>%
    as.numeric()

  # Flatten the multi-line address into one '|'-separated string.
  department_name_and_address <- html_nodes(
    blocks,
    xpath = ".//div[@class='col-block' and contains(., 'Department Name And Address')]"
  ) %>%
    html_text(trim = TRUE) %>%
    gsub("\n", "|", .) %>%
    gsub("[[:space:]]*\\||\\|[[:space:]]*", "|", .)

  block_header <- html_nodes(blocks, "div.block_header")
  # Strip the leading "Bid No.: " label, keeping only the identifier.
  bid_no <- html_nodes(block_header, xpath = ".//p[contains(@class, 'bid_no')]") %>%
    html_text(trim = TRUE) %>%
    gsub("^.*: ", "", .)
  status <- html_nodes(
    block_header, xpath = ".//p/b[contains(., 'Status')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  start_date <- html_nodes(
    blocks, xpath = ".//strong[contains(., 'Start Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)
  end_date <- html_nodes(
    blocks, xpath = ".//strong[contains(., 'End Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  xdf <- data.frame(
    bid_no,
    status,
    start_date,
    end_date,
    items,
    quantity,
    department_name_and_address,
    stringsAsFactors = FALSE
  )
  # Reverse auctions are marked by "/RA/" in the bid number.
  xdf$is_ra <- grepl("/RA/", bid_no)
  xdf
}
# Map the scraper over every page and stack the per-page results into
# a single data frame.
res <- map_df(1:page_num, scr)

# For example, the first five rows scraped from pages 1-2:
map_df(1:2, scr) %>%
  head(5)
bid_no status start_date end_date items quantity
1 GEM/2018/B/94492 Not Evaluated 02-10-2018 10:42:am 22-10-2018 01:00:pm door frame metal detector dfmd security metal detector 1
2 GEM/2018/B/95678 Not Evaluated 29-09-2018 11:01:am 22-10-2018 01:00:pm Foolscap sheets 100
3 GEM/2018/B/96187 Not Evaluated 01-10-2018 10:29:am 22-10-2018 01:00:pm OEM Cartridge/ Consumable 20
4 GEM/2018/B/96196 Not Evaluated 01-10-2018 10:48:am 22-10-2018 01:00:pm OEM Cartridge/ Consumable 20
5 GEM/2018/B/96722 Technical Evaluation 01-10-2018 05:26:pm 22-10-2018 01:00:pm Special Purpose Telephones(smart phone for ICDS) 33914
department_name_and_address is_ra
1 Department Name And Address:||Ministry Of Shipping Na Electronics Directorate General Of Lighthouses And Lightships FALSE
2 Department Name And Address:||Ministry Of Defence Department Of Defence Cweafborjhar N/a FALSE
3 Department Name And Address:||Ministry Of Defence Department Of Defence Cweafborjhar N/a FALSE
4 Department Name And Address:||Ministry Of Defence Department Of Defence Cweafborjhar N/a FALSE
5 Department Name And Address:||Bihar Social Welfare Department Bihar Procurement N/a FALSE
Upvotes: 1