PawelC

Reputation: 1226

Node.js split file lines

I want to write a script that divides the lines read from a file into batches of 25, but unfortunately my sample run ends up with a batch of 40 codes. I would like it to split them so that, for example, if I have 60 codes, it creates two batches of 25 and one batch of 10. Unfortunately, I can't get it to work.

const fs = require('fs');
fs.readFile('code.txt', function (err, data) {
    if (err) throw err;
    const array = data.toString().split("\n");
    let count = 0;

    let items = [];
    for (let i in array) {
        items.push({
            PutRequest: {
                Item: {
                    code: array[i]
                }
            }
        });

        let params = {
            RequestItems: {
                'TABLE_NAME': items
            }
        };

        if (count === 25) {
            dynamoDB.batchWrite(params, function (err, data) {
                if (err) {
                    console.log(err);
                } else {
                    count = 0;
                    items = [];
                }
            });

        }else{
            count++;
        }
    }
});

code.txt content

https://0bin.net/paste/NA8-4hkq#1Ohwt5uUkQqE0YscwnxTX2gxEqlvAUVKp1JRipBCsZg

Any idea what I'm doing wrong?

Upvotes: 0

Views: 1582

Answers (1)

derpirscher

Reputation: 17436

Your dynamoDB.batchWrite() is asynchronous, so its callback is executed only after the loop has already completed. That means items and count are never reset while the loop is still running ...
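You can see the effect with a minimal sketch (only an illustration, with setTimeout standing in for the asynchronous batchWrite call): the loop runs to completion before any callback fires, so the reset comes far too late.

let count = 0;
for (let i = 0; i < 60; i++) {
  count++;
  if (count === 25) {
    // asynchronous, just like dynamoDB.batchWrite
    setTimeout(() => {
      console.log("callback runs, resetting count");
      count = 0;
    }, 0);
  }
}
console.log("loop finished, count =", count); // prints 60 - the reset has not happened yet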

The easiest fix would be to switch to a promise-based approach like the following

const BATCHSIZE = 25;
const fs = require('fs').promises;
// assuming an AWS SDK v2 DocumentClient here; use however dynamoDb is configured in your project
const AWS = require('aws-sdk');
const dynamoDb = new AWS.DynamoDB.DocumentClient();

async function batchLoad() {
   const lines = (await fs.readFile("code.txt", "utf-8")).split("\n");
   while (lines.length > 0) {
      // take the next (at most) 25 lines off the front of the array
      const items = lines.splice(0, BATCHSIZE).map(l => ({PutRequest: {Item: { code: l }}}));
      const params = { RequestItems: { TABLE_NAME: items}};

      // wait for the current batch to finish before starting the next one
      await new Promise((resolve, reject) => {
        dynamoDb.batchWrite(params, (err) => {
          if (err) return reject(err);
          resolve();
        });
      });
   }
}
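Since batchLoad is async, the caller should handle the returned promise; for example (just a usage sketch):

batchLoad()
  .then(() => console.log("all batches written"))
  .catch(err => console.error("batch write failed", err));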

A callback-based approach could look like this

const BATCHSIZE = 25;
const fs = require('fs');

fs.readFile("code.txt", "utf-8", (err, data) => {
  if (err) return console.error(err);
  const lines = data.split("\n");

  function writeBatch() {
    if (!lines.length) return;   // nothing left to write
    const items = lines.splice(0, BATCHSIZE).map(l => ({PutRequest: {Item: { code: l }}}));
    const params = { RequestItems: { TABLE_NAME: items}};

    dynamoDb.batchWrite(params, err => {
      if (err) return console.error(err);   // handle/log the error as appropriate
      writeBatch();                          // only start the next batch after this one succeeded
    });
  }

  writeBatch();
});

The function writeBatch takes a certain number of lines from your original array and writes them into the database. Only after the write to the DB was successful does it recursively call itself and handle the next batch. But be aware that this approach may exceed the maximum call stack size and throw an error.

You can also make either of these approaches leave the lines array untouched (repeatedly splicing it may be quite expensive) and instead just take out the current slice

const BATCHSIZE = 25;
const fs = require('fs').promises;

async function batchLoad() {
   const lines = (await fs.readFile("code.txt", "utf-8")).split("\n");
   let currentIndex = 0;
   while (currentIndex < lines.length) {
      const items = lines.slice(currentIndex, currentIndex + BATCHSIZE).map(l => ({PutRequest: {Item: { code: l }}}));
      const params = { RequestItems: { TABLE_NAME: items}};

      await new Promise((resolve, reject) => {
        dynamoDb.batchWrite(params, (err) => {
          if (err) return reject(err);
          resolve();
        });
      });
      currentIndex += BATCHSIZE;
   }

}

and

const BATCHSIZE = 25;
const fs = require('fs');

fs.readFile("code.txt", "utf-8", (err, data) => {
  if (err) return console.error(err);
  const lines = data.split("\n");

  function writeBatch(currentIndex) {
    if (currentIndex >= lines.length) return;   // all batches done
    const items = lines.slice(currentIndex, currentIndex + BATCHSIZE).map(l => ({PutRequest: {Item: { code: l }}}));
    const params = { RequestItems: { TABLE_NAME: items}};

    dynamoDb.batchWrite(params, err => {
      if (err) return console.error(err);     // handle/log the error as appropriate
      writeBatch(currentIndex + BATCHSIZE);    // continue with the next slice
    });
  }

  writeBatch(0);
});

To prevent running into a maximum call stack exception you can also schedule the next batch on the event loop instead of calling it recursively, i.e.

dynamoDb.batchWrite(params, err => {
  if (err) return console.error(err);   // handle/log the error as appropriate
  setTimeout(() => { writeBatch(currentIndex + BATCHSIZE); }, 0);
});

This way you won't build up a massive callstack from recursive calls.

To keep track of how many records have already been saved to the DB, you could simply store the current counter in a file. When you restart the process, load that file and check how many lines to skip. Don't forget to remove the file once all records have been saved ... For example, with the first approach:

const BATCHSIZE = 25;
const fs = require('fs').promises;

async function batchLoad() {
   const lines = (await fs.readFile("code.txt", "utf-8")).split("\n");
   // how many lines were already written in a previous run
   let skipLines = 0;
   try {
     skipLines = +(await fs.readFile("skip.txt", "utf-8"));
     if (isNaN(skipLines)) skipLines = 0;
     lines.splice(0, skipLines);   // drop the lines that are already in the DB
   } catch (e) {
     skipLines = 0;
   }
   while (lines.length > 0) {
      const items = lines.splice(0, BATCHSIZE).map(l => ({PutRequest: {Item: { code: l }}}));
      const params = { RequestItems: { TABLE_NAME: items}};

      await new Promise((resolve, reject) => {
        dynamoDb.batchWrite(params, (err) => {
          if (err) return reject(err);
          resolve();
        });
      });
      skipLines += BATCHSIZE;
      await fs.writeFile("skip.txt", `${skipLines}`);
   }

   try {
     await fs.unlink("skip.txt");
   } catch (e) {
     // ignore: skip.txt may not exist
   }
}

Upvotes: 1
