Reputation: 3563
Working on some existing code and I'm having an issue with a large regular expression (not my forte). The regex is only matching the expression for cities with one word (.ie. YORK instead of NEW YORK).
Sorry if this isn't clear the regex expression is only matching the second name of the city and not the entire city name. I apologize if this isn't clear, thanks.
I was fooling around with
@"(?<city>[a-zA-Z]+\s*){1,3},\s+(?<state>\w\w)\s+(?<zip>\d+)
for the city but to no avail.
Any help is greatly appreciated.
The Code
private void ReadPdfFile(string filePath)
{
var res = GetTextFromAllPages(filePath);
Regex typ = new Regex(@"\s\n(?<llctype>[\w\s\(\)]+)?---------[\-\s]+", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
Regex city = new Regex(@"(?<city>\w+?),\s+(?<state>\w\w)\s+(?<zip>\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
Regex data = new Regex(@"(?<filling>\d{8,})\s+(?<name>[^\n\r]*).*?law\s+:\s+203.*?county\s+:\s*(?<county>[\w]+)\s+(?<fileraddress>[^\n\r]+).*?EFF\..*?:\s(?<effdate>([\d/]+|\s+))\s(?<address2>[^\r\n]*).*?:\s(\s+|\w+)(?<zip>[^\r\n]+)\s+(?<processaddress>[^\r\n]+)\s+(?<processaddress2>[^\r\n]+)\s+(?<processzip>[^\r\n]+)\s", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
foreach (var page in res)
{
if (typ.IsMatch(page))
{
var mtchTitle = typ.Match(page);
var title = mtchTitle.Groups["llctype"] != null ? mtchTitle.Groups["llctype"].Value : "";
var matchData = data.Matches(page);
foreach (Match m in matchData)
{
var fileradd2 = m.Groups["address2"] != null ? m.Groups["address2"].Value : "";
var processadd2 = m.Groups["processaddress2"] != null ? m.Groups["processaddress2"].Value : "";
var address = m.Groups["fileraddress"] != null ? m.Groups["fileraddress"].Value : "" + (fileradd2.IndexOf(" ") > -1 ? " " + fileradd2.Split(new string[] { " " }, StringSplitOptions.None)[0] : "");
var processaddress = m.Groups["processaddress"] != null ? m.Groups["processaddress"].Value : "" + (processadd2.IndexOf(" ") > -1 ? " " + processadd2.Split(new string[] { " " }, StringSplitOptions.None)[0] : "");
var filerstreet = (fileradd2.IndexOf(" ") > -1 ? " " + fileradd2.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries)[1] : fileradd2);
var processstreet = (processadd2.IndexOf(" ") > -1 ? " " + processadd2.Split(new string[] { " " }, StringSplitOptions.RemoveEmptyEntries)[1] : processadd2);
var cityMatchFiler = city.Match(m.Groups["zip"] != null ? m.Groups["zip"].Value : "");
var cityMatchProcess = city.Match(m.Groups["processzip"] != null ? m.Groups["processzip"].Value : "");
var d = new DocEntity
{
County = m.Groups["county"] != null ? m.Groups["county"].Value.Trim() : "",
EFFDate = m.Groups["effdate"] != null && !string.IsNullOrWhiteSpace(m.Groups["effdate"].Value) ? DateTime.Parse(m.Groups["effdate"].Value) : DateTime.MinValue,
FILER_ADDRESS = address.Trim(),
FILER_CITY = cityMatchFiler.Groups["city"] != null ? cityMatchFiler.Groups["city"].Value.Trim() : "",
FILER_STATE = cityMatchFiler.Groups["state"] != null ? cityMatchFiler.Groups["state"].Value.Trim() : "",
FILER_STREET = filerstreet.Trim(),
FILER_ZIP = cityMatchFiler.Groups["zip"] != null && !string.IsNullOrWhiteSpace(cityMatchFiler.Groups["zip"].Value) ? int.Parse(cityMatchFiler.Groups["zip"].Value) : 0,
FillingNumber = m.Groups["filling"] != null ? m.Groups["filling"].Value.Trim() : "",
LLC_NAME = m.Groups["name"] != null ? m.Groups["name"].Value.Trim() : "",
LLCType = title.Trim(),
ProcessAddress = processaddress.Trim(),
ProcessCity = cityMatchProcess.Groups["city"] != null ? cityMatchProcess.Groups["city"].Value.Trim() : "",
ProcessState = cityMatchProcess.Groups["state"] != null ? cityMatchProcess.Groups["state"].Value.Trim() : "",
ProcessStreet = processstreet.Trim(),
ProcessZIP = cityMatchProcess.Groups["zip"] != null && !string.IsNullOrWhiteSpace(cityMatchProcess.Groups["zip"].Value) ? int.Parse(cityMatchProcess.Groups["zip"].Value) : 0,
};
results.Add(d);
}
}
}
dataGridView1.DataSource = results;
}
Here is a sample of the text:
170914000673 FRESH TRESSES LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : QUEE EZ LEGAL PUBLISHING LLC
EFF. DATE: 09/14/2017 244 FIFTH AVENUE SUITE 2503
DUR. DATE: NEW YORK, NY 10001
170914000215 FYATIKTK, LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : KING PABLO G. VELEZ, ESQ.
EFF. DATE: 09/14/2017 VELEZ & CIPRIANO, PLLC 347 5TH AVENUE STE 810
DUR. DATE: NEW YORK, NY 10016
170914000676 GLOBAL PROPERTY GROUP LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : ROCK ALLSTATE CORPORATE SERVICES CORP.
EFF. DATE: 09/14/2017 ONE COMMERCE PLAZA 99 WASHINGTON AVENUE, SUITE 1008
DUR. DATE: ALBANY, NY 12260
170914000075 GNU SECURITIES LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : ALBA INTEGRATED MANAGEMENT
EFF. DATE: 09/14/2017 SOLUTIONS USA LLC 39 BROADWAY, STE 3300
DUR. DATE: NEW YORK, NY 10006
170914000129 HIGH HILL ASSOCIATES LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : WEST JOHN MORELLO
EFF. DATE: 09/14/2017 3 HIGH HILL FARM PL
DUR. DATE: THORNWOOD, NY 10594
170914000151 HOLLY SLEPT OVER, LLC
LAW : 206 LLC *FILER ADDRESS / PROCESS ADDRESS*
COUNTY : ONON GERMAIN & GERMAIN
EFF. DATE: 09/14/2017 314 E FAYETTE STREET
DUR. DATE: SYRACUSE, NY 13202
Upvotes: 0
Views: 800
Reputation: 357
Try (?<city>[^,]+),
That will capture anything not a comma and save it as the named capture "city"
You can also try something similar with negative-lookahead:
(?<city>(?!,\s+).+)
Not a c# programmer, but I dabble in regex :D
Upvotes: 1