Reputation: 15
I'm scraping email addresses from a file with a regex.
Unfortunately, my regex pattern does not match this string:
" <font size=-1><a href=mailto:[email protected]>_ MR NOURS _</a></font> "
;
I have failed to find the reason on Stack Overflow; I hope someone can tell me what is wrong with my pattern.
This is my code to test it:
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
/*
 * Compiles the POSIX extended regular expression in str_regex, runs it once
 * against str_request and prints either "match" followed by the start/end
 * byte offsets of the whole match, or "unmatch" when regexec() reports
 * REG_NOMATCH. Compilation or execution errors are reported on stderr.
 *
 * Waits for a character on stdin before exiting. Returns EXIT_SUCCESS when
 * the regex could be compiled and executed (match or not), EXIT_FAILURE
 * otherwise.
 */
int main (void)
{
    int status = EXIT_SUCCESS;
    regex_t preg;
    regmatch_t pmatch[5];
    /* Number of slots handed to regexec(): slot 0 is the whole match. */
    const size_t nmatch = sizeof pmatch / sizeof pmatch[0];
    const char *str_request = " <font size=-1><a href=mailto:[email protected]>_ MR NOURS _</a></font> ";
    /* Pattern under test, kept verbatim; it ends with the `$` end-of-string anchor. */
    const char *str_regex = "[a-zA-Z0-9][a-zA-Z0-9_.]+@[a-zA-Z0-9_]+\\.(com|net|[a-zA-Z]{2})$";
    char errbuf[256];

    int err = regcomp(&preg, str_regex, REG_EXTENDED);
    if (err != 0)
    {
        /* Do not fail silently: translate the error code into a message. */
        regerror(err, &preg, errbuf, sizeof errbuf);
        fprintf(stderr, "regcomp failed: %s\n", errbuf);
        status = EXIT_FAILURE;
    }
    else
    {
        int match = regexec(&preg, str_request, nmatch, pmatch, 0);
        if (match != 0 && match != REG_NOMATCH)
        {
            /* regerror() needs the compiled pattern, so call it before regfree(). */
            regerror(match, &preg, errbuf, sizeof errbuf);
            fprintf(stderr, "regexec failed: %s\n", errbuf);
            status = EXIT_FAILURE;
        }
        /* pmatch holds plain offsets, so it stays usable after regfree(). */
        regfree(&preg);

        if (match == 0)
        {
            int start = (int)pmatch[0].rm_so;
            int end = (int)pmatch[0].rm_eo;
            printf("match\n");
            printf("%d - %d\n", start, end);
        }
        else if (match == REG_NOMATCH)
        {
            printf("unmatch\n");
        }
    }

    puts("\nPress any key\n");
    getchar();
    return status;
}
Upvotes: 0
Views: 81
Reputation: 627317
I suspect you are trying to match the substring as a whole word, which is why you used the $
(end-of-string) anchor at the end of the pattern. However, the substring you are looking for is not at the end of the input string.
Since POSIX regex.h
has no portable word-boundary syntax, you can use a workaround:
const char *str_regex = "([a-zA-Z0-9][a-zA-Z0-9_.]+@[a-zA-Z0-9_]+\\.(com|net|[a-zA-Z]{2}))([^a-zA-Z]|$)";
^^^^^^^^^^^^^
The value you need will reside in the capture group 1.
See this C IDEONE demo:
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
/*
 * Compiles the POSIX extended regular expression in str_regex, runs it once
 * against str_request and, on a match, prints "match", the start/end byte
 * offsets of capture group 1 and the captured text in double quotes.
 * Prints "unmatch" when regexec() reports REG_NOMATCH. Compilation or
 * execution errors are reported on stderr.
 *
 * Waits for a character on stdin before exiting. Returns EXIT_SUCCESS when
 * the regex could be compiled and executed (match or not), EXIT_FAILURE
 * otherwise.
 */
int main (void)
{
    int status = EXIT_SUCCESS;
    regex_t preg;
    /* Slot 0 is the whole match; slots 1..3 are the pattern's three capture groups. */
    regmatch_t pmatch[4];
    const size_t nmatch = sizeof pmatch / sizeof pmatch[0];
    const char *str_request = " <font size=-1><a href=mailto:[email protected]>_ MR NOURS _</a></font> ";
    /*
     * Group 1 wraps the address itself; the trailing ([^a-zA-Z]|$) requires a
     * non-letter or the end of the string right after it, standing in for a
     * word boundary.
     */
    const char *str_regex = "([a-zA-Z0-9][a-zA-Z0-9_.]+@[a-zA-Z0-9_]+\\.(com|net|[a-zA-Z]{2}))([^a-zA-Z]|$)";
    char errbuf[256];

    int err = regcomp(&preg, str_regex, REG_EXTENDED);
    if (err != 0)
    {
        /* Do not fail silently: translate the error code into a message. */
        regerror(err, &preg, errbuf, sizeof errbuf);
        fprintf(stderr, "regcomp failed: %s\n", errbuf);
        status = EXIT_FAILURE;
    }
    else
    {
        int match = regexec(&preg, str_request, nmatch, pmatch, 0);
        if (match != 0 && match != REG_NOMATCH)
        {
            /* regerror() needs the compiled pattern, so call it before regfree(). */
            regerror(match, &preg, errbuf, sizeof errbuf);
            fprintf(stderr, "regexec failed: %s\n", errbuf);
            status = EXIT_FAILURE;
        }
        /* pmatch holds plain offsets, so it stays usable after regfree(). */
        regfree(&preg);

        if (match == 0)
        {
            /*
             * regoff_t is not guaranteed to be int, while %d and the `*`
             * precision of %.*s both require int, so convert explicitly.
             */
            int start = (int)pmatch[1].rm_so;
            int end = (int)pmatch[1].rm_eo;
            printf("match\n");
            printf("%d - %d\n\"%.*s\"", start, end, end - start, &str_request[start]);
        }
        else if (match == REG_NOMATCH)
        {
            printf("unmatch\n");
        }
    }

    puts("\nPress any key\n");
    getchar();
    return status;
}
Or just remove the $
if you do not care about whole word matching.
Upvotes: 2