Reputation: 306
I have created a web scraper that scrapes data from a website. The problem is that the website shows both the current day's data and the data for the whole financial year, but my scraper fetches only the current day's data; I cannot retrieve the data for the whole year. For example, if I want the data from '01-July-2015' until today, my scraper still fetches only the current day's data.
link: http://www.nccpl.com.pk/market-information/fipi-lipi/fipi
Below is the screen that has the data to be fetched.
Below is my code
package nccpl_fipi_yearly;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads the NCCPL FIPI page with jsoup and writes every table row that
 * contains {@code <td>} cells to a pipe-separated text file.
 *
 * <p>Typical use is {@link #createConnection()} followed by
 * {@link #parsingHTML()}, which is exactly what {@link #main(String[])} does.
 *
 * <p>All scraping state is kept in static fields, and the shared
 * {@link SimpleDateFormat} instances are not synchronized, so this class is
 * meant to be driven from a single thread.
 */
public class Nccpl_fipi_yearly {

    /** Page fetched by {@link #createConnection()}. */
    private static final String FIPI_URL = "http://www.nccpl.com.pk/market-information/fipi-lipi/fipi";

    /** Connect/read timeout handed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** File that {@link #parsingHTML()} rewrites on every call. */
    private static final String OUTPUT_PATH = "D:\\KSE\\NCCPL-YEARLY.csv";

    /** Written after every cell, including the last one of a row. */
    private static final String CELL_SEPARATOR = " | ";

    /** Written between two rows (never before the first row of a run). */
    private static final String ROW_SEPARATOR = " \r\n ";

    // The five flags below are not read anywhere in this class; they are kept
    // only so that code outside this file that may touch them still compiles.
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;

    /** Parsed page; {@code null} until {@link #createConnection()} succeeds. */
    public static Document doc = null;
    /** Never assigned in this class. */
    public static Elements tbodyElements = null;
    /** Never assigned in this class. */
    public static Elements elements = null;
    /** {@code <td>} cells of the row most recently visited by {@link #parsingHTML()}. */
    public static Elements tdElements = null;
    /** Result of {@code getElementsByTag("tr")} on the row most recently visited. */
    public static Elements trElement2 = null;
    /** One entry per visited row (its {@code <td>} cells, possibly empty). */
    public static ArrayList<Elements> sampleList = new ArrayList<Elements>();
    /** Total number of rows written so far, across all calls to {@link #parsingHTML()}. */
    static int i = 0;

    /**
     * Fetches {@link #FIPI_URL} with an HTTP GET through the configured proxy
     * and stores the parsed page in {@link #doc}.
     *
     * @throws IOException if the page cannot be downloaded or parsed
     */
    public static void createConnection() throws IOException {
        // The proxy is set JVM-wide through system properties, so it also
        // affects any other HTTP traffic of this process.
        System.setProperty("http.proxyHost", "191.1.1.202");
        System.setProperty("http.proxyPort", "8080");
        doc = Jsoup.connect(FIPI_URL).timeout(TIMEOUT_MILLIS).get();
        System.out.println("Successfully Connected");
    }

    /**
     * Writes the cells of every {@code <tr>} found inside elements with class
     * {@code "table"} to {@link #OUTPUT_PATH}, replacing any previous content.
     *
     * <p>Each cell has its commas removed, is passed through
     * {@link #formatData(String)} and is followed by {@code " | "}; rows are
     * separated by {@code " \r\n "}. Rows without {@code <td>} cells produce no
     * output but are still recorded in {@link #sampleList}. Every written row
     * is also echoed to standard output.
     *
     * @throws IllegalStateException if no page has been loaded yet
     * @throws Exception if the output file cannot be opened or written
     */
    public static void parsingHTML() throws Exception {
        if (doc == null) {
            throw new IllegalStateException("No document loaded; call createConnection() first");
        }

        // Opening with append=false truncates an existing file, so no separate
        // delete step is needed. The writer is opened once and closed in
        // finally, so it is released even when a row has no cells or a write fails.
        FileWriter writer = new FileWriter(new File(OUTPUT_PATH), false);
        try {
            // Local counter: the row separator must depend on what this call
            // has written into the fresh file, not on earlier calls.
            int rowsWrittenThisCall = 0;
            for (Element table : doc.getElementsByClass("table")) {
                for (Element row : table.getElementsByTag("tr")) {
                    trElement2 = row.getElementsByTag("tr");
                    tdElements = row.getElementsByTag("td");
                    sampleList.add(tdElements);

                    if (tdElements.isEmpty()) {
                        continue; // e.g. a row made only of <th> cells
                    }

                    String line = formatRow(tdElements);
                    if (rowsWrittenThisCall > 0) {
                        writer.write(ROW_SEPARATOR);
                    }
                    writer.write(line);
                    writer.flush();
                    System.out.println(line);

                    rowsWrittenThisCall++;
                    i++;
                }
            }
        } finally {
            writer.close();
        }
    }

    /** Joins the cleaned-up text of every cell, each followed by {@link #CELL_SEPARATOR}. */
    private static String formatRow(Elements cells) {
        StringBuilder line = new StringBuilder();
        for (Element cell : cells) {
            String content = cell.text().replace(",", "");
            // The separator is written after the last cell as well, which keeps
            // the file layout identical to what earlier versions produced.
            line.append(formatData(content)).append(CELL_SEPARATOR);
        }
        return line.toString();
    }

    /** Parses cell text such as {@code "Jan/05 14:30"}. */
    private static final SimpleDateFormat INPUT_FORMAT = newStrictFormat("MMM/dd HH:mm");

    /** Renders the parsed value as {@code "Jan-05 14:30"}. */
    private static final SimpleDateFormat OUTPUT_FORMAT = new SimpleDateFormat("MMM-dd HH:mm", Locale.US);

    /** Builds a US-locale format that rejects out-of-range fields instead of rolling them over. */
    private static SimpleDateFormat newStrictFormat(String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern, Locale.US);
        // Without this, a value like "Feb/29 10:00" would be rolled over to
        // March 1st because the pattern carries no year.
        format.setLenient(false);
        return format;
    }

    /**
     * Rewrites a {@code "MMM/dd HH:mm"} timestamp as {@code "MMM-dd HH:mm"}.
     *
     * <p>The 24-hour pattern letter {@code HH} is used on both sides so that
     * the time of day survives the round trip; a 12-hour pattern without an
     * AM/PM marker would turn {@code 14:30} into {@code 02:30}.
     *
     * @param text cell text; may be {@code null}
     * @return the reformatted timestamp, or {@code text} unchanged when it is
     *         {@code null} or does not start with a valid timestamp
     */
    public static String formatData(String text) {
        if (text == null) {
            return null;
        }
        try {
            Date parsed = INPUT_FORMAT.parse(text);
            return OUTPUT_FORMAT.format(parsed);
        } catch (ParseException notATimestamp) {
            // Most cells are plain text or numbers; pass them through as-is.
            return text;
        }
    }

    /**
     * Downloads the page and writes its table rows to the output file.
     *
     * @param args ignored
     * @throws Exception if downloading or writing fails
     */
    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();
    }
}
Upvotes: 0
Views: 489
Reputation: 8738
Just edit the way Jsoup connects, using the search form in the same page:
// Submit the page's search form instead of issuing a plain GET.
// Each data(key, value) call adds one form field to the request.
doc = Jsoup.connect(tempUrl)
// NOTE(review): dates look like dd/MM/yyyy — confirm against the site's form.
.data("fromDate", "17/06/2015")
.data("toDate", "17/06/2016")
// NOTE(review): meaning of type=101 is not visible here — verify against the form's options.
.data("type", "101")
.data("search", "search")
.timeout(10000)
// post() sends the request as an HTTP POST and parses the response into a Document.
.post();
Upvotes: 2