Reputation: 1371
I'm using the POSIX regular expressions regex.h
in C to count the number of appearances of a phrase in an English-language text fragment.
But the return value of regexec(...)
only tells if a match was found or not. So I tried to use the nmatch
and matchptr
to find the distinct appearances, but when I printed out the matches from matchptr
, I only got the index of the first appearance of the phrase in my text.
Here is my code:
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
/* Number of regmatch_t slots handed to regexec(): slot 0 receives the whole
 * match, the following slots receive the parenthesised sub-expressions. */
#define MAX_MATCHES 20

/*
 * Run the compiled pattern `pexp` against `sz` once and print every slot of
 * the resulting match array.
 *
 * On success one line is printed per slot (MAX_MATCHES lines in total), so
 * slots that regexec() left unused show up with the offsets -1 - -1.
 * On failure a single "does not match" line is printed.
 */
void match(regex_t *pexp, char *sz) {
    regmatch_t slots[MAX_MATCHES];
    int status = regexec(pexp, sz, MAX_MATCHES, slots, 0);

    if (status != 0) {
        printf("\"%s\" does not match\n", sz);
        return;
    }

    /* NOTE(review): %d assumes regoff_t is int on this platform - confirm. */
    const regmatch_t *slot = slots;
    const regmatch_t *const past_last = slots + MAX_MATCHES;
    while (slot != past_last) {
        printf("\"%s\" matches characters %d - %d\n", sz, slot->rm_so, slot->rm_eo);
        slot++;
    }
}
/*
 * Compile the case-insensitive extended regex "(the)" and run match() on a
 * sample sentence.
 *
 * Returns 0 on success and 1 if the pattern could not be compiled.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    regex_t re;
    int rv = regcomp(&re, "(the)", REG_EXTENDED | REG_ICASE);
    if (rv != 0) {
        /* A regex_t that failed to compile must not be passed to regexec()
         * or regfree(), so report the reason and stop here. */
        char reason[128];
        regerror(rv, &re, reason, sizeof reason);
        fprintf(stderr, "regcomp failed: %s\n", reason);
        return 1;
    }

    match(&re, "the cat is in the bathroom.");
    regfree(&re);
    return 0;
}
How can I make this code report both of the distinct matches of the regular expression (the)
in the string the cat is in the bathroom
?
Upvotes: 3
Views: 2831
Reputation: 133929
You've misunderstood the meaning of pmatch
. It is not used for getting repeated pattern matches; it is used to get the location of a single match and of its possible subgroups. As the Linux manual page for regcomp(3)
says:
The offsets of the subexpression starting at the
i
th open parenthesis are stored in pmatch[i]
. The entire regular expression's match addresses are stored in pmatch[0]
. (Note that to return the offsets of N subexpression matches, nmatch
must be at least N+1
.) Any unused structure elements will contain the value -1
.
If you have the regular expression this (\w+) costs (\d+) USD
, there are 2 capturing groups in parentheses (\w+)
and (\d+)
; now if nmatch
was set to at least 3, pmatch[0]
would contain the start and end indices of the whole match, pmatch[1]
start and end for the (\w+)
group and pmatch[2]
for the (\d+)
group.
The following code should print the ranges of consecutive matches, if any (the beginning of each range is the offset of the first character inside the match, and the end of the range is the offset of the first character after the match, matching the rm_so
, rm_eo
usage); or the string "<the input string>" does not contain a match
if the pattern never matches.
It is carefully constructed so that it also works for a regular expression that can match the empty string (an empty regular expression, or say the regular expression #?
, will match at each character position, including after the last character; 28 matches of that regular expression would be reported for the input the cat is in the bathroom.
)
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
#include <string.h>
/*
 * Print the range of every successive, non-overlapping match of the compiled
 * pattern `pexp` in the NUL-terminated string `sz`.
 *
 * Each match is reported as "range S - E matches", where S is the offset of
 * the first character inside the match and E is the offset of the first
 * character after it (the rm_so / rm_eo convention). If the pattern never
 * matches, a single "does not contain a match" line is printed instead.
 *
 * Patterns that can match the empty string are supported: such a match is
 * reported once per position, including the position after the last character.
 */
void match(const regex_t *pexp, const char *sz) {
    /* Only the whole-pattern match is needed in this example. */
    regmatch_t whole_match;
    /* The first search may anchor ^ at the start of the string; later
     * searches start mid-string, so they must not treat that as a line start. */
    int eflags = 0;
    int found = 0;
    size_t offset = 0;
    const size_t length = strlen(sz);

    while (regexec(pexp, sz + offset, 1, &whole_match, eflags) == 0) {
        /* rm_so / rm_eo are relative to sz + offset and are non-negative
         * after a successful match, so converting them to size_t is safe. */
        const size_t start = offset + (size_t)whole_match.rm_so;
        const size_t end = offset + (size_t)whole_match.rm_eo;

        found = 1;
        printf("range %zu - %zu matches\n", start, end);

        /* Do not let ^ match again. */
        eflags = REG_NOTBOL;

        /* Resume right after this match. A zero-length match would leave us
         * at the same position forever, so step one character further. */
        offset = end;
        if (start == end) {
            offset++;
        }

        /* Stop once every character has been consumed. offset == length is
         * still searched, to allow a zero-length match at the very end. */
        if (offset > length) {
            break;
        }
    }

    if (!found) {
        printf("\"%s\" does not contain a match\n", sz);
    }
}
/*
 * Compile the case-insensitive extended regex "(the)" and run match() on a
 * sample sentence.
 *
 * Returns 0 on success and 1 if the pattern could not be compiled.
 */
int main(int argc, char* argv[]) {
    (void)argc;
    (void)argv;

    regex_t re;
    int rv = regcomp(&re, "(the)", REG_EXTENDED | REG_ICASE);
    if (rv != 0) {
        /* A regex_t that failed to compile must not be passed to regexec()
         * or regfree(), so report the reason and stop here. */
        char reason[128];
        regerror(rv, &re, reason, sizeof reason);
        fprintf(stderr, "regcomp failed: %s\n", reason);
        return 1;
    }

    match(&re, "the cat is in the bathroom.");
    regfree(&re);
    return 0;
}
P.S., the parentheses in your regex (the)
are unnecessary in this case; you could just write the
(and your initial confusion about getting 2 matches at the same position arose because you got one match for the whole expression (the)
and one submatch for the group the
; had you not used these parentheses, your code would have printed the location of the first match only once).
Upvotes: 8