how to scrape data with respect to date

Question

I have created a web scraper that scrapes data from a website. The problem is that the website shows both the current day's data and the data for the whole financial year, but my scraper fetches only the current day's data; I cannot retrieve the data for the whole year. For example, if I want to retrieve the data from '01-July-2015' until today, my scraper still fetches only the current day's data.
Link: http://www.nccpl.com.pk/market-information/fipi-lipi/fipi (this is the screen that has the data to be fetched). Below is my code:

package nccpl_fipi_yearly;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class Nccpl_fipi_yearly {
    boolean writeCSVToConsole = true;
    boolean writeCSVToFile = true;
    boolean sortTheList = true;
    boolean writeToConsole;
    boolean writeToFile;
    public static Document doc = null;
    public static Elements tbodyElements = null;
    public static Elements elements = null;
    public static Elements tdElements = null;
    public static Elements trElement2 = null;
    public static ArrayList sampleList = new ArrayList();
    static int i = 0;

    public static void createConnection() throws IOException {
        System.setProperty("http.proxyHost", "191.1.1.202");
        System.setProperty("http.proxyPort", "8080");
        String tempUrl = "http://www.nccpl.com.pk/market-information/fipi-lipi/fipi";
        doc = Jsoup.connect(tempUrl).timeout(10000).get();
        System.out.println("Successfully Connected");
    }

    public static void parsingHTML() throws Exception {
        File fold = new File("D:\KSE\NCCPL-YEARLY.csv");
        fold.delete();
        File fnew = new File("D:\KSE\NCCPL-YEARLY.csv");
        for (Element table : doc.getElementsByClass("table")) {

            for (Element trElement : table.getElementsByTag("tr")) {

                trElement2 = trElement.getElementsByTag("tr");
                tdElements = trElement.getElementsByTag("td");
                FileWriter sb = new FileWriter(fnew, true);

                //if (table.hasClass("marketData")) { //&&(tdElements.hasClass("tableHead")&&tdElements.hasClass("tableSubHead"))
                for (Iterator it = tdElements.iterator(); it.hasNext();) {
                    if (it.hasNext()&& i>0) {
                        ///sb.append(" | ");
                        sb.append(" 
 ");
                    }

                    for (Iterator it2 = tdElements.iterator(); it.hasNext();) {
                        Element tdElement2 = it.next();
                        final String content = tdElement2.text().replace(",", "");
                        if (it2.hasNext()) {

                            sb.append(formatData(content));
                            sb.append("   |   ");

                        }

                    }

                    System.out.println(sb.toString());
                    sb.flush();
                    sb.close();
                    i++;
                }

                System.out.println(sampleList.add(tdElements));


            }

        }
    }
    private static final SimpleDateFormat FORMATTER_MMM_d_yyyy = new SimpleDateFormat("MMM/dd hh:mm", Locale.US);
    private static final SimpleDateFormat FORMATTER_dd_MMM_yyyy = new SimpleDateFormat("MMM-dd hh:mm", Locale.US);

    public static String formatData(String text) {
        String tmp = null;

        try {
            Date d = FORMATTER_MMM_d_yyyy.parse(text);
            tmp = FORMATTER_dd_MMM_yyyy.format(d);
        } catch (ParseException pe) {
            tmp = text;
        }

        return tmp;
    }

    public static void main(String[] args) throws IOException, Exception {
        createConnection();
        parsingHTML();

    }
}

Davide Pastore · Accepted Answer

Just edit the way Jsoup connects, using the search form in the same page:

// Submit the page's search form with a date range instead of issuing a plain GET.
doc = Jsoup.connect(tempUrl)
  .data("fromDate", "17/06/2015")   // range start; day/month/year order (17 cannot be a month)
  .data("toDate", "17/06/2016")     // range end, same format
  .data("type", "101")              // NOTE(review): the meaning of "101" is not shown here; confirm against the form
  .data("search", "search")         // presumably the form's submit-button field; confirm against the page
  .timeout(10000)
  .post();                          // sends the fields above as an HTTP POST and parses the response

how to scrape data with respect to date

Answers (1)

Related Questions