Joshua Swiss
Joshua Swiss

Reputation: 305

Javascript regex ignore first capture group

I asked a similar question before and thought I had the correct answer, but later realized I was capturing some strings I should not be.

I am trying to parse a large text file and pull certain elements out with regex. I'm using Node for my site, so I'm doing this in Javascript.

In the examples below, I am trying to match 10 strings of numbers with commas and periods. In the first example, I match the right pattern, but I capture two outlying strings (I only want the numbers at the end of the lines starting with " 4 0000....").

https://regex101.com/r/nO8nM1/8

In this example, I match the right instances of the string, but I am not able to ignore the first capture group, so additional characters and whitespaces are included.

https://regex101.com/r/uB6hE4/1

Regex:

/(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm

sample data:

                  23205        - Grants Current-County Operatin                        4,425,327.00"

"    4   0000047387         Central Equatoria State          1003-1478 Sta Hosp Oper Oct                   85,784.00"
"    4   0000047442         EASTERN EQUATORIA ST             1003-1479 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047485         JONGLEI STATE                    1003-1519 Sta Hosp Oper Oct                  144,608.00"
"    4   0000047501         Lakes State                      1003-1482 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047528         Unity State                      1003-1484 Sta Hosp Oper Oct                   75,980.00"
"    4   0000047532         Northern Bahr-el State           1003-1483 Sta Hosp Oper Oct                   58,824.00"
"    4   0000047615         Western E State                  1003-1488 Sta Hosp Oper Oct                   93,137.00"
"    4   0000047638         Warap State                      1003-1486 Sta Hosp Oper Oct                   51,471.00"
"    4   0000047680         Upper Nile State                 1003-1485 Capitation                  102,941.00"
"    4   0000047703         Western BG State                 1003-1487 Sta Hosp Oper Oct                   34,314.00"
                                                                                             ----------------------
"        Total For Period          4                                                                      833,333.00"
 ----------------------------------------------------------------------------------------------------------------------------
 Fiscal Year        2015/16                               Republic Of South Sudan                         Date     2015/11/20
 Period                   5                                                                               Time       12:58:40
                                                  FreeBalance Financial Management System                 Page              7
 ----------------------------------------------------------------------------------------------------------------------------
                                                            Vendor Analysis Report

                                                              1091 Health (MOH)
  Prd   Voucher #          Vendor Name                      Description                          Amount
  ---   ----------------   ------------------------------   -----------------------------    ----------------------
                                                                                             ----------------------
"  

(\d+,\d+,\d+.\d+)(?=")

Regex 2:

/(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm

In my code I push these values to an array of objects if they are present. I tried only pushing the match group relevant to what I want, but it results in pushing only the index item from the matches.

I've had a hard time trying several different combinations of ?:, ?=, and ?! to ignore the first capture group in the second link to no avail. I feel like the solution must be fairly simple, but I can't quite get there. Any thoughts on what I'm doing wrong?

My code:

var openFile = function(event) {
    var input = event.target;
    var reader = new FileReader();
  reader.onload = function() {
    var text = reader.result;
    // console.log(text.substring(0, 999999999999999));
      var section = text.substring(0, 9999999999999999);
      var subSection = [];
      console.log(typeof subSection);
      var masterArray = new Object();
      var uploadDate = "";
      var period = "";
      var transferArray = [];
      var subSectionRegex = /   Total([\s\S]*?)Total|^\s+\d{4,5}([\s\S]*?)Total F/gm;
      var transferCodeRegex = /[0-9]{4,5}/;
      var voucherNumberRegex = /([0-9]{7,10}[\S])(?=\s+)/g;
      var vendorRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)/gm;
      var descriptionRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+\s(\s\S+)?)(?=\s+100)/g;
      // var descriptionRegex = /(\d{4}-\d{4})(\D+)*\s\D/g;
      var amountRegex = /(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm;
      // var amountRegex = /(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm;
      // var amountRegex = /\w\s{10,20}(\d+(?:,\d{3})*\.\d+)/gm;
      var oneLineAmountRegex = /(\d+,\d+,\d+.\d+)|\d+,\d+.\d+/g;
      var oneLineDescRegex = / - (\D+)|- \d+(\D+)/gm;
      var allData = [{}];

      console.log('section: ' + typeof section);
        subSection = section.match(subSectionRegex);
        subSection = subSection.filter(Boolean);
        console.log(typeof subSection);

      function extractDate() {
        uploadDate = section.match(/Date (.*)/)[1].trim();
        uploadDate = new Date(uploadDate);
        allData["uploadDate"] = uploadDate;
      }
      extractDate();
      // console.log(allData.uploadDate);

      function extractPeriod() {
        period = section.match(/Period (.*)/)[1].trim();
        period = period.split(" ");
        period = period[0];
        period = parseInt(period);
        // console.log("period: " + period);
        allData["period"] = period;
      }
      extractPeriod();
      // console.log(allData.period);

      function extractDetails() {
        for(var i = 0; i < subSection.length; i++) {
            if(subSection[i].match(transferCodeRegex) && subSection[i].match(voucherNumberRegex) && subSection[i].match(vendorRegex) && subSection[i].match(descriptionRegex) && subSection[i].match(amountRegex)) {
                transferArray.push({
                    "transferCode": subSection[i].match(transferCodeRegex),
                    "details": [{
                        "voucherNumber": subSection[i].match(voucherNumberRegex),
                        "vendor": subSection[i].match(vendorRegex),
                        "description": subSection[i].match(descriptionRegex),
                        "total": subSection[i].match(amountRegex)
                    }]
                })
            } else {
                transferArray.push({
                    "transferCode": subSection[i].match(transferCodeRegex),
                    "details": [{
                        "voucherNumber": subSection[i].match(voucherNumberRegex),
                        "description": subSection[i].match(oneLineDescRegex),
                        "total": subSection[i].match(oneLineAmountRegex)
                    }]
                })
            }
        }
      }

    function removeNulls(obj) {
            var isArray = obj instanceof Array;
            for(var k in obj) {
                console.log('k: ' + k);
            if(obj[k] === null || obj[k] === undefined) isArray ? obj.splice(k, 1) : delete obj[k];
            else if (typeof obj[k] === "object") removeNulls(obj[k]);
            }
        }

        removeNulls(transferArray);
        console.log(transferArray);
        console.log(JSON.stringify(transferArray, null, 2))

        function cleanData() {
            transferArray.forEach(function(e) {
                console.log(e)
            e.details.forEach(function(evt) {
                console.log(evt)
                console.log(evt.amount)
                console.log(evt.description)
            for(i = 0; i < evt.amount.length; i++) {
              // evt.amount[i] = evt.amount[i].toString();
              // evt.amount[i] = evt.amount[i].replace(/^[a-zA-Z]\s+/g, '');
              evt.amount[i] = parseFloat(evt.amount[i].replace(/\,/g, ""));
            }
            for(i = 0; i < evt.description.length; i++) {
            evt.description[i] = evt.description[i].toString();
            evt.description[i] = evt.description[i].trim();
            }
            return(evt);
            })
                // console.log(evt.amount);
                // console.log(evt.description);
            });
        }
        cleanData();
        console.log(transferArray);

        console.log(transferArray);

            //adds detailed data to allData array
      allData["section"] = transferArray;

      extractDetails();
      console.log(allData);

      function pushArrayToObject() {

      }
  };
  reader.readAsText(input.files[0]);
};

Upvotes: 0

Views: 687

Answers (1)

Jodevan
Jodevan

Reputation: 750

Does that work for you?

As @stribizhev stated, JS doesn't support lookbehind. However, you can tweak your regex to get everything that is enclosed between " and then everything you have to do is to capture the single group returned.

This is the expression:

/^".*\s+([\d{1,3},]*\d{1,3}.\d{2})"$/;

So wherever you need to get the totalAmount (assuming this is the value you're looking for), you can just do it in this way:

subSection[i].match(oneLineAmountRegex[1])

Upvotes: 1

Related Questions