moth

Reputation: 2409

How to get the contents of this web page in CSV or TSV format with curl or wget

I have the following link:

http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=loadAircraftConditionsResultPage&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=

I would like to download all the tables in this database and transform them into a CSV or TSV file. Is there any curl or wget command that allows me to parse this database?

Something like this:

wget -r -np -k http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet\?method\=loadAircraftConditionsResultPage\&enterpriseName\=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8\&licenceCode\=\&partsNumber\=\&partsName\=\&ataChaptersection\=

only gets me the website's code, not the contents of the tables themselves.

Upvotes: 1

Views: 617

Answers (3)

hanshenrik

Reputation: 21573

Is there any curl or wget command that allows me to parse this database

Not really; neither curl nor wget knows how to parse JSON or write CSV.
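
Any scripting language that can make the HTTP request, decode the JSON and quote the CSV fields will do, though. As a rough sketch of the idea in Python (assuming the requests library is installed; the endpoint and query parameters are copied from the URLs used elsewhere on this page, and this sketch has not been run against the live server):

import csv
import sys

import requests

# query parameters copied from the site's own request (see the URLs in the
# other answers); iDisplayStart / iDisplayLength control the pagination
params = {
    "method": "aircraftConditionsQuery",
    "iColumns": 10,
    "sColumns": "",
    "iDisplayStart": 0,      # offset of the first row to return
    "iDisplayLength": 20,    # rows per page
    "enterpriseName": "",
    "licenceCode": "",
    "partsNumber": "",
    "partsName": "",
    "ataChaptersection": "",
}
# the mDataProp_0..mDataProp_9 column names the site sends along
for i, name in enumerate([
    "enterpriseName", "licenceCode", "partsNumber", "partsName",
    "ataChaptersection", "manufacturers", "6", "fileToAccord",
    "mainDevices", "remark",
]):
    params[f"mDataProp_{i}"] = name

resp = requests.get(
    "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet",
    params=params,
    timeout=30,
)
rows = resp.json()["aaData"]  # the table rows are under the "aaData" key

# header = union of all keys seen, so sparse rows still line up
fieldnames = sorted({key for row in rows for key in row})
writer = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
writer.writeheader()
for row in rows:
    # stringify any nested values so the csv module can serialise them
    writer.writerow({k: (str(v) if isinstance(v, (dict, list)) else v)
                     for k, v in row.items()})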

php-cli is one option; the command

php -r 'function csv_quote(string $str): string {     return '\''"'\'' . strtr($str, ['\''"'\'' => '\''""'\'']) . '\''"'\''; } function shittycvs(array $d): void {     $str = '\'''\'';     $fields = [];     foreach ($d as $foo) {         foreach ($foo as $key => $_) {              $fields[$key] = true;         }     }     $fields = array_keys($fields);     $str .= implode(",", $fields) . "\n";     foreach ($d as $v) {         foreach ($fields as $field) {             if (is_array($v[$field] ?? "")) {                 $v[$field] = print_r($v[$field], true);             }             $str .= csv_quote($v[$field] ?? "") . ",";         }         $str = substr($str, 0, -1) . "\n";     }     echo $str; }  $ch = curl_init(); curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => '\'''\'')); $data = []; for ($i = 0; $i < 100; $i += 20) {     $url = '\''http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?'\'' . http_build_query(array(         '\''method'\'' => '\''aircraftConditionsQuery'\'',         '\''iColumns'\'' => 10,         '\''sColumns'\'' => '\'''\'',         '\''mDataProp_0'\'' => '\''enterpriseName'\'',         '\''mDataProp_1'\'' => '\''licenceCode'\'',         '\''mDataProp_2'\'' => '\''partsNumber'\'',         '\''mDataProp_3'\'' => '\''partsName'\'',         '\''mDataProp_4'\'' => '\''ataChaptersection'\'',         '\''mDataProp_5'\'' => '\''manufacturers'\'',         '\''mDataProp_6'\'' => '\''6'\'',         '\''mDataProp_7'\'' => '\''fileToAccord'\'',         '\''mDataProp_8'\'' => '\''mainDevices'\'',         '\''mDataProp_9'\'' => '\''remark'\'',         '\''enterpriseName'\'' => '\'''\'',         '\''licenceCode'\'' => '\'''\'',         '\''partsNumber'\'' => '\'''\'',         '\''partsName'\'' => '\'''\'',         '\''ataChaptersection'\'' => '\'''\'',         '\''iDisplayLength'\'' => 20,         '\''iDisplayStart'\'' => $i,     ));     curl_setopt($ch, CURLOPT_URL, $url);     $js = curl_exec($ch);     $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE);     if (empty($jsd['\''aaData'\''])) {         /*on last page aaData is empty..*/         break;     }     foreach ($jsd['\''aaData'\''] as $v) {         $data[] = $v;     } } shittycvs($data);'

Its output can be seen at https://gist.github.com/divinity76/236d1df60a6b29783fcfb90fac12d7bf

It fetches the first 100 results, but there are over 46,000 results in total. To fetch them all, replace $i < 100 with $i < 999999 or something; from there the loop should stop at the /*on last page aaData is empty..*/ break; after SEVERAL HOURS.

And that shell command, unpacked, looks like this:

<?php
function csv_quote(string $str): string
{
    return '"' . strtr($str, ['"' => '""']) . '"';
}
function shittycvs(array $d): void
{
    $str = '';
    $fields = [];
    foreach ($d as $foo) {
        foreach ($foo as $key => $_) {

            $fields[$key] = true;
        }
    }
    $fields = array_keys($fields);
    $str .= implode(",", $fields) . "\n";
    foreach ($d as $v) {
        foreach ($fields as $field) {
            if (is_array($v[$field] ?? "")) {
                $v[$field] = print_r($v[$field], true);
            }
            $str .= csv_quote($v[$field] ?? "") . ",";
        }
        $str = substr($str, 0, -1) . "\n";
    }
    echo $str;
}

$ch = curl_init();
curl_setopt_array($ch, array(CURLOPT_RETURNTRANSFER => 1, CURLOPT_VERBOSE => 1, CURLOPT_ENCODING => ''));
$data = [];
for ($i = 0; $i < 100; $i += 20) {
    $url = 'http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?' . http_build_query(array(
        'method' => 'aircraftConditionsQuery',
        'iColumns' => 10,
        'sColumns' => '',
        'mDataProp_0' => 'enterpriseName',
        'mDataProp_1' => 'licenceCode',
        'mDataProp_2' => 'partsNumber',
        'mDataProp_3' => 'partsName',
        'mDataProp_4' => 'ataChaptersection',
        'mDataProp_5' => 'manufacturers',
        'mDataProp_6' => '6',
        'mDataProp_7' => 'fileToAccord',
        'mDataProp_8' => 'mainDevices',
        'mDataProp_9' => 'remark',
        'enterpriseName' => '',
        'licenceCode' => '',
        'partsNumber' => '',
        'partsName' => '',
        'ataChaptersection' => '',
        'iDisplayLength' => 20,
        'iDisplayStart' => $i,
    ));
    curl_setopt($ch, CURLOPT_URL, $url);
    $js = curl_exec($ch);
    $jsd = json_decode($js, true, 999, JSON_THROW_ON_ERROR | JSON_INVALID_UTF8_SUBSTITUTE);
    if (empty($jsd['aaData'])) {
        /*on last page aaData is empty..*/
        break;
    }
    foreach ($jsd['aaData'] as $v) {
        $data[] = $v;
    }
}
shittycvs($data);
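
To use the unpacked version, save it to a file (any name will do, say fetch_caac.php) and run it with the php CLI, redirecting standard output to a file, since the script prints the CSV to stdout, e.g. php fetch_caac.php > caac.csv.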

Upvotes: 0

baduker

Reputation: 20052

Since this question has a web-scraping tag, I thought you could give Python a try if you want to scrape ALL the tables from that site.

And by ALL I mean all 2347 pages, merged into a single .csv file.

Before I move on to the code, I'd like to share a small disclaimer:

The code below relies heavily on multi-threading to request data from the server, which might result in refused connections, forced interruptions, and/or a ban from further connections.

Use it wisely and at your own risk.

Having said that, you don't really have to run the code below, as the entire dump can be fetched as caac_gov_merged_.csv.

That way, you don't put the servers under unnecessary strain by scraping them excessively. If, however, you'd like to get that done yourself, here's the code.

The code:

import functools
import json
import pathlib
import random
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path, PurePath
from urllib.parse import urlencode

import pandas as pd
import requests

BATCH_DIR = "./batches"
RESULTS_DIR = "./batch_results"
DEFAULT_BATCH_SIZE = 250


def timer(func):
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        end_time = time.perf_counter()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.4f} secs")
        return value
    return wrapper_timer


def build_payloads() -> list:
    return [
        [
            ('method', 'aircraftConditionsQuery'),
            ('iDisplayStart', page),
            ('iDisplayLength', '20'),
            ('enterpriseName', '武汉航达航空科技发展有限公司'),
        ] for page in range(0, 46921, 10)
    ]


def create_urls(payloads: list) -> list:
    base_url = "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?"
    return [f"{base_url}{urlencode(payload)}" for payload in payloads]


def urls_to_batches(urls: list, step: int = DEFAULT_BATCH_SIZE) -> list:
    return [urls[i:i+step] for i in range(0, len(urls), step)]


def prepare_batches(url_batches: list, batch_dir: str = BATCH_DIR):
    pathlib.Path(batch_dir).mkdir(parents=False, exist_ok=False)
    for number, batch in enumerate(url_batches, start=1):
        batch_file = PurePath(batch_dir).joinpath(f"batch_{number}.batch")
        with open(batch_file, "w") as out:
            out.writelines("\n".join(batch))


def previous_jobs() -> bool:
    return Path(BATCH_DIR).exists()


def read_files(batch_dir: str = BATCH_DIR) -> iter:
    yield from (path for path in Path(batch_dir).iterdir())


def open_batch_file(batch_file_path: Path) -> list:
    with open(batch_file_path) as batch_file:
        return [line.strip() for line in batch_file.readlines()]


def get_data(api_url: str) -> list:
    return requests.get(api_url).json()["aaData"]


@timer
def thread(batch_urls: list) -> list:
    results = []
    with ThreadPoolExecutor() as executor:
        for result in [executor.submit(get_data, url) for url in batch_urls]:
            results.extend(result.result())
    return results


def dump_thread_results(
        results: list,
        batch_file_path: Path,
        batch_results: str = RESULTS_DIR
):
    pathlib.Path(batch_results).mkdir(parents=False, exist_ok=True)
    output_file = f"{batch_file_path.name.rsplit('.')[0]}.json"
    with open(PurePath(batch_results).joinpath(output_file), "w") as out:
        json.dump(results, out, indent=4, sort_keys=True)


def wait(start: int = 60, stop: int = 180):
    sleep_for = random.randint(start, stop)
    print(f"Sleeping for {sleep_for} seconds.")
    time.sleep(sleep_for)


def scrape_data():
    for batch in read_files():
        print(f"Processing {batch}...")
        print(f"There are {len(list(read_files()))} batches left.")
        try:
            dump_thread_results(thread(open_batch_file(batch)), batch)
            Path(batch).unlink()
            wait()
        except (
                requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError,
        ) as error:
            print(f"Connection failed: {error}")
            wait(start=180, stop=360)
            continue
    Path(BATCH_DIR).rmdir()


def merge_results(file_name: str = "caac_gov_merged_data.csv"):
    merged_batch_results = []
    for result_file in read_files(Path(RESULTS_DIR)):
        with open(result_file) as result:
            merged_batch_results.extend(json.load(result))
    pd.DataFrame(
        merged_batch_results
    ).drop(
        "pageScrollParas",
        axis=1,
    ).dropna(
        how="all",
        axis=1,
    ).to_csv(
        Path(RESULTS_DIR).joinpath(file_name),
        index=False,
    )


def run_scrapper():
    scrape_data()
    merge_results()


@timer
def main():
    if previous_jobs():
        run_scrapper()
    else:
        prepare_batches(urls_to_batches(create_urls(build_payloads())))
        run_scrapper()


if __name__ == "__main__":
    main()
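
A few notes on running the script, all of which follow from the code above. The only third-party dependencies are requests and pandas. On the first run it writes batched URL lists into ./batches, then processes one .batch file at a time, saving each batch's rows as JSON under ./batch_results and deleting the .batch file once it is done. Because previous_jobs() only checks whether ./batches still exists, re-running the script after an interruption resumes with the remaining batches instead of starting over. The merged output lands in ./batch_results/caac_gov_merged_data.csv.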

Upvotes: 2

aborruso

Reputation: 5728

If you want JSON output, the URL is

http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart=0&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=

In the URL, pagination is controlled by the iDisplayStart parameter, starting from 0 and moving in steps of 20 results.

Using GNU Parallel, jq and Miller you can download everything and transform it into CSV.

For example, to download the first 6 JSON outputs (iDisplayStart from 0 to 100), you can run

echo {0..100..20} | tr " " "\n" | parallel -j0 'curl "http://fsop.caac.gov.cn/g145/CARS/WebSiteQueryServlet?method=aircraftConditionsQuery&iColumns=10&sColumns=&iDisplayStart={}&iDisplayLength=20&mDataProp_0=enterpriseName&mDataProp_1=licenceCode&mDataProp_2=partsNumber&mDataProp_3=partsName&mDataProp_4=ataChaptersection&mDataProp_5=manufacturers&mDataProp_6=6&mDataProp_7=fileToAccord&mDataProp_8=mainDevices&mDataProp_9=remark&enterpriseName=%E6%AD%A6%E6%B1%89%E8%88%AA%E8%BE%BE%E8%88%AA%E7%A9%BA%E7%A7%91%E6%8A%80%E5%8F%91%E5%B1%95%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&licenceCode=&partsNumber=&partsName=&ataChaptersection=" | jq -c '"'"'.aaData[]'"'"' >{}.jsonl'

As output you will have 6 JSONL files, one per offset.

To merge them into a single CSV, you can run

mlr --j2c unsparsify *.jsonl >output.csv

The result will look something like https://gist.github.com/aborruso/777fce957865011c85a7a689cd0bd413
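
To fetch the whole dataset instead of just the first 6 pages, extend the brace expansion accordingly, e.g. {0..46920..20} to cover all 2347 pages mentioned above (not tested here). Note that -j0 tells parallel to run as many jobs at once as it can, so it may be kinder to the server to cap the job count, e.g. with -j4.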

Upvotes: 2
