Reputation: 2876
I'm uploading some Facebook post data into BigQuery. I have the basic information: post name, post message, reach, likes, etc.
I've cleaned all the post names and post messages by deleting every " character, but I still get the following error:
file-00000000: Error detected while parsing row starting at position: 15934. Error: Missing close double quote (") character. (error code: invalid)
Can anything other than a " character cause this error?
I'm exporting the data from a Google Sheet to BigQuery, so here is my script if needed:
function BQ_fb_export() {
  var projectId = 'XXXXX';
  var fileId = 'XXXXXXX';
  var tableId = 'XXXXXXX';

  // Define our load job.
  var jobSpec = {
    configuration: {
      load: {
        destinationTable: {
          projectId: projectId,
          datasetId: 'Facebook',
          tableId: tableId
        },
        allowJaggedRows: true,
        writeDisposition: 'WRITE_TRUNCATE',
        schema: {
          fields: [
            {name: 'Page_ID', type: 'STRING'},
            {name: 'Post_ID', type: 'STRING'},
            {name: 'Post_creation_date', type: 'STRING'},
            {name: 'Post_name', type: 'STRING'},
            {name: 'Post_message', type: 'STRING'},
            {name: 'Link_to_post', type: 'STRING'},
            {name: 'Post_shared_link', type: 'STRING'},
            {name: 'Post_type', type: 'STRING'},
            {name: 'Post_reach', type: 'INTEGER'},
            {name: 'Post_organic_reach', type: 'INTEGER'},
            {name: 'Post_paid_reach', type: 'INTEGER'},
            {name: 'Post_viral_reach', type: 'INTEGER'},
            {name: 'Post_engaged_users', type: 'INTEGER'},
            {name: 'Post_likes', type: 'INTEGER'},
            {name: 'Post_shares', type: 'INTEGER'},
            {name: 'Post_comments', type: 'INTEGER'},
            {name: 'Post_link_clicks', type: 'INTEGER'},
            {name: 'Video_views', type: 'INTEGER'}
          ]
        }
      }
    }
  };

  var spreadsheet = SpreadsheetApp.openById(fileId);
  var filename = spreadsheet.getName();
  // This assumes there is a sheet named "Raw_data".
  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName("Raw_data");
  var Row_count = sheet.getLastRow();
  var data = sheet.getDataRange().getValues();
  var csvdata = "";

  for (var row = 1; row < data.length && row < Row_count + 1; row++) {
    for (var col = 0; col < data[row].length; col++) {
      var punctRE = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~]/g;
      var spaceRE = /\s+/g;
      var cell = data[row][col].toString();
      if (cell.match(/http/g) && !cell.match(/www.facebook.com/g)) {
        cell = data[row][col].toString();
      } else if (!cell.match(/www.facebook.com/g)) {
        cell = data[row][col].toString().replace(punctRE, '').replace(spaceRE, ' ');
      }
      if (cell.indexOf(",") != -1) {
        csvdata += "\"" + cell + "\"";
      } else {
        csvdata += cell;
      }
      if (col < data[row].length - 1) {
        csvdata += ",";
      }
    }
    csvdata += "\r\n";
  }
  Logger.log(csvdata);

  var blob = Utilities.newBlob(csvdata, "application/octet-stream");
  // Execute the job.
  BigQuery.Jobs.insert(jobSpec, projectId, blob);
}
Upvotes: 1
Views: 3659
Reputation: 1280
In my case, adding the line below to my Python code worked (full code below). As per the docs, allow_quoted_newlines indicates whether to allow quoted data sections that contain newline characters in a CSV file; the default value is false.
allow_quoted_newlines = True
For the bq load command, add the following flag:
bq load --allow_quoted_newlines < rest of command >
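Since the question loads from Apps Script, the same option should be available there as well: the advanced BigQuery service wraps the REST API, whose JobConfigurationLoad has an allowQuotedNewlines field. A minimal sketch against the question's jobSpec (an assumption on my part, not code I've run):
// Sketch: the same fix applied to the question's Apps Script jobSpec.
// allowQuotedNewlines tells BigQuery to accept newline characters inside
// quoted fields instead of treating them as the end of the record.
var jobSpec = {
  configuration: {
    load: {
      destinationTable: {projectId: projectId, datasetId: 'Facebook', tableId: tableId},
      allowJaggedRows: true,
      allowQuotedNewlines: true,
      writeDisposition: 'WRITE_TRUNCATE'
      // schema: { ... } as in the question
    }
  }
};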
Python code to load the data into BigQuery:
from google.cloud import bigquery
from google.api_core.exceptions import BadRequest

client = bigquery.Client('project-id-b4f8d566')
dataset_id = 'census-ds'
dataset_ref = client.dataset(dataset_id)

job_config = bigquery.LoadJobConfig()
job_config.allow_quoted_newlines = True
job_config.schema = [
    bigquery.SchemaField("id", "INTEGER", "REQUIRED"),
    bigquery.SchemaField("code", "STRING", "NULLABLE"),
    bigquery.SchemaField("answer", "STRING", "NULLABLE")
]
job_config.skip_leading_rows = 1
# The source format defaults to CSV, so the line below is optional.
job_config.source_format = bigquery.SourceFormat.CSV

uri = "gs://mybucket/text.csv"
load_job = client.load_table_from_uri(
    uri, dataset_ref.table("census_text"), job_config=job_config
)  # API request
print("Starting job {}".format(load_job.job_id))

try:
    load_job.result()  # Waits for table load to complete.
    print("Job finished.")
    destination_table = client.get_table(dataset_ref.table("census_text"))
    print("Loaded {} rows.".format(destination_table.num_rows))
except BadRequest:
    for err in load_job.errors:
        print('ERROR: {}'.format(err['message']))
Docs: https://cloud.google.com/bigquery/docs/loading-data-cloud-storage-csv#csv-options
Upvotes: 5
Reputation: 2876
I finally discovered that the issue was about newlines, not double quotes as the error suggests. So I also removed all the newlines from my post name and post message columns and it worked perfectly. Here is my cleaning variable now:
var punctRE = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,\-.\/:;<=>?@\[\]^_`{|}~\r\n]/g; // \r and \n added to the class so newlines are stripped too
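As an alternative, if you want to keep the newlines instead of deleting them, you could escape the cells RFC 4180 style before building the CSV and set allowQuotedNewlines on the load job, as in the answer above. A small sketch (untested, just the idea):
// Sketch: quote any cell containing a comma, quote, or newline,
// doubling embedded quotes, instead of stripping them out.
function toCsvField(value) {
  var s = String(value);
  if (/[",\r\n]/.test(s)) {
    return '"' + s.replace(/"/g, '""') + '"';
  }
  return s;
}
That way the original post text survives the export unchanged.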
Hope it helps someone else!
Upvotes: 3