Reputation: 99
I am using Cloud Functions to run a query across a partitioned table in BigQuery. The table is 1 GB in size.
The script pulls all rows matching an id and streams them into a Google Cloud Storage bucket as a CSV file. It does this for every id in the table (around 100 in total). The script works, but it runs slowly and only gets through nine merchants before the Cloud Function runs out of memory. Increasing the memory in the build menu has no effect.
Is there a way to speed this operation up and reduce the amount of memory it uses on each iteration?
const { BigQuery } = require("@google-cloud/bigquery");
const { Storage } = require("@google-cloud/storage");
const bucketName = "xxxx";
const stream = require("stream");
const { parse } = require("json2csv");

const bigquery = new BigQuery();
const storage = new Storage();

const fields = [
  "id",
  "product_name",
  "product_desc",
  "etc"
];

exports.importBQToGCS = (req, res) => {
  "use strict";
  const liveMerchantCount = 113;
  (async function () {
    try {
      for (let i = 1; i < liveMerchantCount; i += 1) {
        const query =
          `SELECT *
          FROM \`table_name\`
          WHERE id_number = ` + i;
        const options = {
          query: query,
          location: "EU",
        };
        const [job] = await bigquery.createQueryJob(options);
        console.log(`Job ${job.id} started.`);
        const createFile = storage.bucket(bucketName).file(`test_${i}.csv`);
        // Loads the full result set for this merchant into memory,
        // then converts it to CSV in one go.
        const [rows] = await job.getQueryResults();
        const csv = parse(rows, { fields });
        const dataStream = new stream.PassThrough();
        dataStream.push(csv);
        dataStream.push(null);
        await new Promise((resolve, reject) => {
          console.log("Writing to GCS");
          dataStream
            .pipe(
              createFile.createWriteStream({
                resumable: false,
                validation: false,
                metadata: { "Cache-Control": "public, max-age=31536000" },
              })
            )
            .on("error", (error) => {
              console.error("Stream failed", error);
              reject(error);
            })
            .on("finish", () => {
              resolve(true);
            });
        });
      }
      res.status(200).send();
    } catch (err) {
      res.send(err);
    }
  })();
};
Upvotes: 0
Views: 401
Reputation: 10712
You could reduce the memory footprint by streaming records from BigQuery instead of loading each result set into memory. If you use the @databases client, it has a queryNodeStream method for exactly this use case: https://www.atdatabases.org/docs/bigquery-client#bigqueryclient-querynodestream
// CommonJS equivalent of `import connectBigQuery, { sql } from "@databases/bigquery"`
const { default: connectBigQuery, sql } = require("@databases/bigquery");
const { Storage } = require("@google-cloud/storage");
const bucketName = "xxxx";

const bigquery = connectBigQuery();
const storage = new Storage();

const fields = [
  "id",
  "product_name",
  "product_desc",
  "etc"
];

exports.importBQToGCS = (req, res) => {
  "use strict";
  const liveMerchantCount = 113;
  (async function () {
    try {
      for (let i = 1; i < liveMerchantCount; i += 1) {
        const createFile = storage.bucket(bucketName).file(`test_${i}.csv`);
        await new Promise((resolve, reject) => {
          console.log("Writing to GCS");
          bigquery.queryNodeStream(sql`
            SELECT * FROM table_name WHERE id_number = ${i}
          `)
            .on("error", reject)
            // Placeholder: swap in any streaming row-to-CSV transform here
            // (e.g. json2csv's Transform stream configured with `fields`).
            .pipe(someStreamingCsvLibrary({ fields }))
            .on("error", reject)
            .pipe(
              createFile.createWriteStream({
                resumable: false,
                validation: false,
                metadata: { "Cache-Control": "public, max-age=31536000" },
              })
            )
            .on("error", (error) => {
              console.error("Stream failed", error);
              reject(error);
            })
            .on("finish", () => {
              resolve(true);
            });
        });
      }
      res.status(200).send();
    } catch (err) {
      res.send(err);
    }
  })();
};
Upvotes: 0
Reputation: 99
On further testing, the issue seems to be that BigQuery returns a particularly large JSON result (250 MB) for one merchant, which exhausts the function's memory. All the other results are under 100 MB. Once I excluded that particular query, the function worked as expected.
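For anyone hitting the same limit with the standard @google-cloud/bigquery client, here is a minimal sketch of how the oversized result could be handled by paging through getQueryResults rather than materialising the whole result set. It assumes the same bigquery, storage, parse (json2csv), fields and bucketName setup as in the question; the writeMerchantCsv name and the 10,000-row page size are illustrative only.
// Sketch only: manual pagination so that at most one page of rows is held in memory.
// Assumes `parse` from json2csv, `fields`, and a GCS File object are already defined.
async function writeMerchantCsv(job, gcsFile) {
  const writeStream = gcsFile.createWriteStream({ resumable: false, validation: false });
  let pageOptions = { autoPaginate: false, maxResults: 10000 };
  let isFirstPage = true;
  while (pageOptions) {
    // With autoPaginate disabled, getQueryResults resolves to [rows, nextQuery, response];
    // nextQuery is null once the last page has been fetched.
    const [rows, nextQuery] = await job.getQueryResults(pageOptions);
    if (rows.length > 0) {
      // Emit the CSV header only once, on the first page.
      writeStream.write(parse(rows, { fields, header: isFirstPage }) + "\n");
      isFirstPage = false;
    }
    pageOptions = nextQuery && { ...nextQuery, autoPaginate: false };
  }
  await new Promise((resolve, reject) => {
    writeStream.on("finish", resolve).on("error", reject);
    writeStream.end();
  });
}
Calling writeMerchantCsv(job, storage.bucket(bucketName).file(`test_${i}.csv`)) inside the existing loop would keep per-merchant memory to roughly one page of rows instead of the full 250 MB result; backpressure handling is omitted for brevity.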
Upvotes: 1