David Beyda

Reputation: 77

How to create an ETL from BigQuery to Google Storage using CDAP?

I'm setting up CDAP in my Google Cloud environment, but I'm having trouble executing the following pipeline: run a query on BigQuery and save the result as a CSV file on Google Storage.

My process was:

  1. Install CDAP using the CDAP OSS image at Google Marketplace.

  2. Build the following pipeline:

{
    "artifact": {
        "name": "cdap-data-pipeline",
        "version": "6.0.0",
        "scope": "SYSTEM"
    },
    "description": "Data Pipeline Application",
    "name": "cdap_dsc_test",
    "config": {
        "resources": {
            "memoryMB": 2048,
            "virtualCores": 1
        },
        "driverResources": {
            "memoryMB": 2048,
            "virtualCores": 1
        },
        "connections": [
            {
                "from": "BigQuery",
                "to": "Google Cloud Storage"
            }
        ],
        "comments": [],
        "postActions": [],
        "properties": {},
        "processTimingEnabled": true,
        "stageLoggingEnabled": true,
        "stages": [
            {
                "name": "BigQuery",
                "plugin": {
                    "name": "BigQueryTable",
                    "type": "batchsource",
                    "label": "BigQuery",
                    "artifact": {
                        "name": "google-cloud",
                        "version": "0.12.2",
                        "scope": "SYSTEM"
                    },
                    "properties": {
                        "project": "bi-data-science",
                        "serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
                        "referenceName": "test_tables",
                        "dataset": "google_trends",
                        "table": "devices"
                    }
                },
                "outputSchema": [
                    {
                        "name": "etlSchemaBody",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ]
            },
            {
                "name": "Google Cloud Storage",
                "plugin": {
                    "name": "GCS",
                    "type": "batchsink",
                    "label": "Google Cloud Storage",
                    "artifact": {
                        "name": "google-cloud",
                        "version": "0.12.2",
                        "scope": "SYSTEM"
                    },
                    "properties": {
                        "project": "bi-data-science",
                        "suffix": "yyyy-MM-dd",
                        "format": "json",
                        "serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
                        "delimiter": ",",
                        "referenceName": "gcs_cdap",
                        "path": "gs://hurb_sandbox/cdap_experiments/"
                    }
                },
                "outputSchema": [
                    {
                        "name": "etlSchemaBody",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ],
                "inputSchema": [
                    {
                        "name": "BigQuery",
                        "schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
                    }
                ]
            }
        ],
        "schedule": "0 * * * *",
        "engine": "mapreduce",
        "numOfRecordsPreview": 100,
        "description": "Data Pipeline Application",
        "maxConcurrentRuns": 1
    }
}

The credential key has owner privileges and I'm able to access the query result using the "preview" option.

Pipeline result (screenshot):

Output files (screenshot):
No CSV file has been generated, and I also haven't found anywhere in CDAP to set a name for my output file. Did I miss any configuration step?

Update:

We eventually gave up on CDAP and are now using Google DataFlow instead.

Upvotes: 1

Views: 530

Answers (1)

Ali Anwar

Reputation: 431

When configuring the GCS sink in the pipeline, there is a 'format' field, which you have set to JSON. You can set this to CSV to achieve the format you would like.
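As a sketch, the change would only affect the "format" value in the GCS sink stage's properties from the pipeline JSON above (other properties kept as in the question; the "schema" entry is omitted here for brevity):

    "properties": {
        "project": "bi-data-science",
        "suffix": "yyyy-MM-dd",
        "format": "csv",
        "serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
        "delimiter": ",",
        "referenceName": "gcs_cdap",
        "path": "gs://hurb_sandbox/cdap_experiments/"
    }

After redeploying the pipeline with this change, the sink should write CSV output to the configured path instead of JSON.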

Upvotes: 1
