Reputation: 77
I'm setting up CDAP in my Google Cloud Environment, but having problems to execute the following pipeline: run a query on BigQuery and save the result in a CSV file on Google Storage.
My process was:
Install CDAP using the CDAP OSS image at Google Marketplace.
Build the following pipeline:
{
"artifact": {
"name": "cdap-data-pipeline",
"version": "6.0.0",
"scope": "SYSTEM"
},
"description": "Data Pipeline Application",
"name": "cdap_dsc_test",
"config": {
"resources": {
"memoryMB": 2048,
"virtualCores": 1
},
"driverResources": {
"memoryMB": 2048,
"virtualCores": 1
},
"connections": [
{
"from": "BigQuery",
"to": "Google Cloud Storage"
}
],
"comments": [],
"postActions": [],
"properties": {},
"processTimingEnabled": true,
"stageLoggingEnabled": true,
"stages": [
{
"name": "BigQuery",
"plugin": {
"name": "BigQueryTable",
"type": "batchsource",
"label": "BigQuery",
"artifact": {
"name": "google-cloud",
"version": "0.12.2",
"scope": "SYSTEM"
},
"properties": {
"project": "bi-data-science",
"serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
"schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
"referenceName": "test_tables",
"dataset": "google_trends",
"table": "devices"
}
},
"outputSchema": [
{
"name": "etlSchemaBody",
"schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
}
]
},
{
"name": "Google Cloud Storage",
"plugin": {
"name": "GCS",
"type": "batchsink",
"label": "Google Cloud Storage",
"artifact": {
"name": "google-cloud",
"version": "0.12.2",
"scope": "SYSTEM"
},
"properties": {
"project": "bi-data-science",
"suffix": "yyyy-MM-dd",
"format": "json",
"serviceFilePath": "/home/ubuntu/bi-data-science-cdap-4cbf526de374.json",
"schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}",
"delimiter": ",",
"referenceName": "gcs_cdap",
"path": "gs://hurb_sandbox/cdap_experiments/"
}
},
"outputSchema": [
{
"name": "etlSchemaBody",
"schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
}
],
"inputSchema": [
{
"name": "BigQuery",
"schema": "{\"type\":\"record\",\"name\":\"etlSchemaBody\",\"fields\":[{\"name\":\"destination_name\",\"type\":[\"string\",\"null\"]},{\"name\":\"destination_country\",\"type\":[\"string\",\"null\"]},{\"name\":\"timestamp\",\"type\":[\"double\",\"null\"]},{\"name\":\"desktop\",\"type\":[\"double\",\"null\"]},{\"name\":\"tablet\",\"type\":[\"double\",\"null\"]},{\"name\":\"mobile\",\"type\":[\"double\",\"null\"]}]}"
}
]
}
],
"schedule": "0 * * * *",
"engine": "mapreduce",
"numOfRecordsPreview": 100,
"description": "Data Pipeline Application",
"maxConcurrentRuns": 1
}
}
The credential key has owner privileges and I'm able to access the query result using the "preview" option.
Pipeline result:
Files:
None csv file has been generated and I'm also not found a place where I can set a name to my output file in CDAP. Did I miss any configuration step?
We eventualy gave up on CDAP, and we're using Google DataFlow.
Upvotes: 1
Views: 530
Reputation: 431
When configuring the GCS sink in the pipeline, there is a 'format' field, which you have set to JSON. You can set this to CSV to achieve the format you would like.
Upvotes: 1