Reputation: 305
I asked a similar question before and thought I had the correct answer, but later realized I was capturing some strings I should not be.
I am trying to parse a large text file and pull certain elements out with regex. I'm using Node for my site, so I'm doing this in JavaScript.
In the examples below, I am trying to match 10 strings of numbers with commas and periods. In the first example, I match the right pattern, but I capture two outlying strings (I only want the numbers at the end of the lines starting with " 4 0000....").
https://regex101.com/r/nO8nM1/8
In this example, I match the right instances of the string, but I am not able to ignore the first capture group, so additional characters and whitespace are included.
https://regex101.com/r/uB6hE4/1
Regex:
/(\d+,\d+.\d+)(?=")|(\d+,\d+,\d+.\d+)(?=")/gm
sample data:
23205 - Grants Current-County Operatin 4,425,327.00"
" 4 0000047387 Central Equatoria State 1003-1478 Sta Hosp Oper Oct 85,784.00"
" 4 0000047442 EASTERN EQUATORIA ST 1003-1479 Sta Hosp Oper Oct 93,137.00"
" 4 0000047485 JONGLEI STATE 1003-1519 Sta Hosp Oper Oct 144,608.00"
" 4 0000047501 Lakes State 1003-1482 Sta Hosp Oper Oct 93,137.00"
" 4 0000047528 Unity State 1003-1484 Sta Hosp Oper Oct 75,980.00"
" 4 0000047532 Northern Bahr-el State 1003-1483 Sta Hosp Oper Oct 58,824.00"
" 4 0000047615 Western E State 1003-1488 Sta Hosp Oper Oct 93,137.00"
" 4 0000047638 Warap State 1003-1486 Sta Hosp Oper Oct 51,471.00"
" 4 0000047680 Upper Nile State 1003-1485 Capitation 102,941.00"
" 4 0000047703 Western BG State 1003-1487 Sta Hosp Oper Oct 34,314.00"
----------------------
" Total For Period 4 833,333.00"
----------------------------------------------------------------------------------------------------------------------------
Fiscal Year 2015/16 Republic Of South Sudan Date 2015/11/20
Period 5 Time 12:58:40
FreeBalance Financial Management System Page 7
----------------------------------------------------------------------------------------------------------------------------
Vendor Analysis Report
1091 Health (MOH)
Prd Voucher # Vendor Name Description Amount
--- ---------------- ------------------------------ ----------------------------- ----------------------
----------------------
"
(\d+,\d+,\d+.\d+)(?=")
Regex 2:
/(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm
In my code I push these values to an array of objects if they are present. I tried only pushing the match group relevant to what I want, but it results in pushing only the index item from the matches.
I've had a hard time trying several different combinations of `?:`, `?=`, and `?!` to ignore the first capture group in the second link, to no avail. I feel like the solution must be fairly simple, but I can't quite get there. Any thoughts on what I'm doing wrong?
My code:
/**
 * File-input change handler: reads the selected report as text, splits it
 * into transfer-code sections and pulls voucher numbers, vendors,
 * descriptions and amounts out of every section.
 *
 * The parsed result is logged to the console. It is also returned from the
 * FileReader `onload` callback (FileReader ignores the return value), which
 * lets the parsing be exercised without a DOM.
 *
 * Shape of the result:
 *   { uploadDate: Date|null, period: number|null, section: [ {
 *       transferCode: string,
 *       details: [ { voucherNumber?: string[], vendor?: string[],
 *                    description?: string[], total?: number[] } ] } ] }
 *
 * @param {Event} event - change event whose target is an <input type="file">.
 */
var openFile = function(event) {
  var input = event.target;
  var reader = new FileReader();

  var subSectionRegex = / Total([\s\S]*?)Total|^\s+\d{4,5}([\s\S]*?)Total F/gm;
  var transferCodeRegex = /[0-9]{4,5}/;
  var voucherNumberRegex = /([0-9]{7,10}[\S])(?=\s+)/g;
  var vendorRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)/gm;
  var descriptionRegex = /(?!\d{10})(\S+\s\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+(\s\S+)?)(?=\s+100)|(?!\d{10})(\S+\s(\s\S+)?)(?=\s+100)/g;
  var amountRegex = /(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+.\d+)(?=")|(?:\s\w{3}\s+|Capitation\s+)(\d+,\d+,\d+.\d+)(?=")/gm;
  var oneLineAmountRegex = /(\d+,\d+,\d+.\d+)|\d+,\d+.\d+/g;
  var oneLineDescRegex = / - (\D+)|- \d+(\D+)/gm;

  /**
   * Collects, for every match of a global regex, the first capture group that
   * participated in the match (or the full match when none did).
   *
   * String.prototype.match with the `g` flag returns only full matches, so a
   * non-capturing prefix such as "Oct " would otherwise leak into the values.
   *
   * @param {string} text - text to scan.
   * @param {RegExp} regex - must carry the `g` flag.
   * @returns {string[]|null} captured values, or null when nothing matched.
   */
  function captureAll(text, regex) {
    var found = [];
    var m;
    regex.lastIndex = 0; // exec() on a /g regex is stateful
    while ((m = regex.exec(text)) !== null) {
      if (m[0] === '') regex.lastIndex++; // never loop forever on an empty match
      var value = m[0];
      for (var g = 1; g < m.length; g++) {
        if (m[g] !== undefined) {
          value = m[g];
          break;
        }
      }
      found.push(value);
    }
    return found.length ? found : null;
  }

  /**
   * Reads the first "Date <value>" occurrence in the report.
   * YYYY/MM/DD is parsed explicitly; anything else falls back to Date parsing.
   *
   * @param {string} section - full report text.
   * @returns {Date|null} the date, or null when no "Date " label is present.
   */
  function extractDate(section) {
    var found = section.match(/Date (.*)/);
    if (!found) return null;
    var raw = found[1].trim();
    var ymd = /^(\d{4})\/(\d{1,2})\/(\d{1,2})$/.exec(raw);
    return ymd
      ? new Date(Number(ymd[1]), Number(ymd[2]) - 1, Number(ymd[3]))
      : new Date(raw);
  }

  /**
   * Reads the number that follows the first "Period " occurrence.
   *
   * @param {string} section - full report text.
   * @returns {number|null} the period, or null when absent or not numeric.
   */
  function extractPeriod(section) {
    var found = section.match(/Period (.*)/);
    if (!found) return null;
    var value = parseInt(found[1].trim().split(' ')[0], 10);
    return isNaN(value) ? null : value;
  }

  /**
   * Builds one entry per sub-section. When every detail regex matches, the
   * entry carries voucher/vendor/description/total; otherwise it falls back
   * to the one-line description and amount patterns.
   *
   * @param {string[]} subSections - chunks produced by subSectionRegex.
   * @returns {Object[]} entries of the form { transferCode, details: [ ... ] }.
   */
  function extractDetails(subSections) {
    return subSections.map(function(chunk) {
      var code = chunk.match(transferCodeRegex);
      var vouchers = chunk.match(voucherNumberRegex);
      var vendors = chunk.match(vendorRegex);
      var descriptions = chunk.match(descriptionRegex);
      var amounts = captureAll(chunk, amountRegex);
      var isDetailed = code && vouchers && vendors && descriptions && amounts;
      var details = isDetailed
        ? {
            voucherNumber: vouchers,
            vendor: vendors,
            description: descriptions,
            total: amounts
          }
        : {
            voucherNumber: vouchers,
            description: captureAll(chunk, oneLineDescRegex),
            total: captureAll(chunk, oneLineAmountRegex)
          };
      // Store the matched text, not the match array with its index/input extras.
      return { transferCode: code ? code[0] : null, details: [details] };
    });
  }

  /**
   * Recursively removes null/undefined array elements and object properties,
   * in place.
   *
   * @param {Object|Array} obj - structure to prune.
   */
  function removeNulls(obj) {
    if (Array.isArray(obj)) {
      // Walk backwards so splicing does not shift elements still to be visited.
      for (var idx = obj.length - 1; idx >= 0; idx--) {
        if (obj[idx] === null || obj[idx] === undefined) obj.splice(idx, 1);
        else if (typeof obj[idx] === 'object') removeNulls(obj[idx]);
      }
      return;
    }
    Object.keys(obj).forEach(function(key) {
      if (obj[key] === null || obj[key] === undefined) delete obj[key];
      else if (typeof obj[key] === 'object') removeNulls(obj[key]);
    });
  }

  /**
   * Converts amount strings ("85,784.00") to numbers and trims descriptions,
   * in place. Fields that were pruned away are skipped.
   *
   * @param {Object[]} transfers - entries produced by extractDetails.
   */
  function cleanData(transfers) {
    transfers.forEach(function(transfer) {
      transfer.details.forEach(function(detail) {
        if (detail.total) {
          detail.total = detail.total.map(function(amount) {
            return parseFloat(amount.replace(/,/g, ''));
          });
        }
        if (detail.description) {
          detail.description = detail.description.map(function(text) {
            return String(text).trim();
          });
        }
      });
    });
  }

  reader.onload = function() {
    var section = reader.result;
    // match() yields null when the report contains no recognizable section.
    var subSection = (section.match(subSectionRegex) || []).filter(Boolean);

    // Extract first, then prune and clean; doing it the other way round
    // would operate on an empty array.
    var transferArray = extractDetails(subSection);
    removeNulls(transferArray);
    cleanData(transferArray);

    var allData = {
      uploadDate: extractDate(section),
      period: extractPeriod(section),
      section: transferArray
    };
    console.log(allData);
    return allData;
  };
  reader.readAsText(input.files[0]);
};
Upvotes: 0
Views: 687
Reputation: 750
Does that work for you?
As @stribizhev stated, JS doesn't support lookbehind. However, you can tweak your regex to match everything enclosed between the `"` characters, and then all you have to do is capture the single group returned.
This is the expression:
/^".*\s+([\d{1,3},]*\d{1,3}.\d{2})"$/;
So wherever you need to get the totalAmount (assuming this is the value you're looking for), you can just do it this way:
// Index the match result (capture group 1), not the regex itself.
// Assumes oneLineAmountRegex is a non-global regex with one capture group;
// with the `g` flag, match() returns full matches and [1] is the second match.
(subSection[i].match(oneLineAmountRegex) || [])[1]
Upvotes: 1