Reputation: 1286
Tried this with both the standard regex library as well as the PCRE library in ANSI C under Linux:
Need to catch the content between brackets, multiple times in the same string, but I can only get the first one or it matches the whole line (non-greedy match).
src [] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
So the result I want is getting 3 substrings with:
/*
 * Compile a PCRE pattern, run it once against a fixed sample string and
 * print every captured substring (index 0 is the whole match).
 *
 * Returns 0 when the pattern matched, 1 on a compile error, on
 * "no match", or on any other matching error.
 */
int main(int argc, char *argv[]) {
    pcre *re;
    const char *error;
    int erroffset;
    int ovector[OVECCOUNT];
    int rc, i;
    /* Every inner double quote must be escaped and the literal must be
     * closed; otherwise the program does not compile. */
    char src[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
                 " device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
                 " device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
    /* This is the pattern the question is about; it is kept unchanged. */
    char pattern[] = ".+device=\"(.+(?R))\".+";

    /* The command-line arguments are not used. */
    (void)argc;
    (void)argv;

    re = pcre_compile(pattern, 0, &error, &erroffset, NULL);
    if (re == NULL) {
        printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
        return 1;
    }

    /* Single match attempt starting at offset 0 of src. */
    rc = pcre_exec(re, NULL, src, (int)strlen(src), 0, 0, ovector, OVECCOUNT);
    if (rc < 0) {
        if (rc == PCRE_ERROR_NOMATCH) {
            printf("Sorry, no match ...\n");
        } else {
            printf("Matching error %d\n", rc);
        }
        /* Compiled patterns are released with pcre_free(), not free(). */
        pcre_free(re);
        return 1;
    }

    printf("\nOK, has matched ...\n\n");
    /* ovector holds (start, end) offset pairs into src, one pair per
     * reported substring. */
    for (i = 0; i < rc; i++) {
        char *substring_start = src + ovector[2*i];
        int substring_length = ovector[2*i+1] - ovector[2*i];
        printf("%2d: %.*s\n", i, substring_length, substring_start);
    }

    pcre_free(re);
    return 0;
}
The regex 'testers' on the web can set the global flag which seems to work, but is not available in PCRE. What can I do?
Ideally I would prefer to use the standard regex.h lib, but PCRE is also fine if needed.
Upvotes: 0
Views: 286
Reputation: 119
Standard POSIX extended pattern (^|[\t\v\f\r ])device="([^"]*)"
works just fine. Then, the zeroth match is the entire match, the first match is the whitespace character preceding device=
or empty string if it starts at the beginning of the line, and the second match is the contents of the device name:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three groups of device / device_name / address attributes,
 * each value enclosed in double quotes, separated by single spaces. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
/* POSIX extended regular expression.
 * Group 1: start of string or one of tab, vertical tab, form feed,
 *          carriage return or space preceding "device=".
 * Group 2: the text between the double quotes following "device=". */
const char pattern[] = "(^|[\t\v\f\r ])device=\"([^\"]*)\"";
/*
 * Print the value of every device="..." attribute found in data[].
 *
 * The pattern is compiled once and then applied repeatedly; each search
 * starts where the previous match ended, until regexec() reports no
 * further match.  Empty values (device="") are not printed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the pattern fails to compile.
 */
int main(void)
{
    regex_t expression;
    regmatch_t match[3];
    size_t pos = 0;
    int err;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        /* The pattern is compiled with REG_EXTENDED, so say so. */
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    for (;;) {
        /* After the first match, data + pos is no longer the real start
         * of the string; REG_NOTBOL keeps '^' from matching there. */
        const int eflags = (pos > 0) ? REG_NOTBOL : 0;

        if (regexec(&expression, data + pos, 3, match, eflags) != 0) {
            break;
        }

        /* match[] offsets are relative to data + pos. */
        if (match[2].rm_so >= 0 && match[2].rm_eo > match[2].rm_so) {
            const size_t off = pos + (size_t)match[2].rm_so;
            const size_t len = (size_t)(match[2].rm_eo - match[2].rm_so);
            /* Print directly from data[] with a precision instead of
             * copying into a variable-length array. */
            printf("Matched '%.*s'.\n", (int)len, data + off);
        }

        /* Defensive: never loop without making progress. */
        if (match[0].rm_eo <= 0) {
            break;
        }
        pos += (size_t)match[0].rm_eo;
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
As Jonathan Leffler mentioned in a comment to the question, the matches are obtained in a loop, where the next lookup starts where the previous match ended. The loop ends when there are no more matches.
If you want to support multiple quotation styles, you could use something like ^device="([^"]*)"|^device='([^']*)'|^device=([^\t\v\f\r ]*)|[\t\v\f\r ]device="([^"]*)"|[\t\v\f\r ]device='([^']*)'|[\t\v\f\r ]device=([^\t\v\f\r ]*)
, with at least seven elements in the match[]
array. Then, exactly one of the entries match[1]
to match[6]
will have .rm_so >= 0
, and that will identify the desired contents:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <sys/types.h>
#include <regex.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* Sample input: three groups of device / device_name / address attributes,
 * each value enclosed in double quotes, separated by single spaces. */
const char data[] = "device=\"device 1\" device_name=\"the first device\" address=\"192.168.1.10\""
" device=\"device 2\" device_name=\"the second device\" address=\"192.168.1.12\""
" device=\"device 3\" device_name=\"the third device\" address=\"192.168.1.13\"";
/* POSIX extended regular expression made of six alternatives, each with
 * exactly one capture group (groups 1 to 6, in this order):
 *   1-3: "device=" at the start of the string, with a double-quoted,
 *        single-quoted, or unquoted value;
 *   4-6: the same three forms, preceded by a tab, vertical tab, form feed,
 *        carriage return or space.
 * An unquoted value extends up to the next such whitespace character. */
const char pattern[] = "^device=\"([^\"]*)\""
"|" "^device='([^']*)'"
"|" "^device=([^\t\v\f\r ]*)"
"|" "[\t\v\f\r ]device=\"([^\"]*)\""
"|" "[\t\v\f\r ]device='([^']*)'"
"|" "[\t\v\f\r ]device=([^\t\v\f\r ]*)";
/*
 * Print the value of every device=... attribute found in data[], whether
 * the value is double-quoted, single-quoted or unquoted.
 *
 * The pattern is compiled once and then applied repeatedly; each search
 * starts where the previous match ended, until regexec() reports no
 * further match.  Exactly one of the capture groups 1..6 takes part in
 * any given match; that group holds the value.  Empty values are not
 * printed.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the pattern fails to compile.
 */
int main(void)
{
    regex_t expression;
    regmatch_t match[7];
    size_t pos = 0;
    int err, k;

    err = regcomp(&expression, pattern, REG_EXTENDED);
    if (err) {
        char errbuf[1024];
        (void)regerror(err, &expression, errbuf, sizeof errbuf);
        /* The pattern is compiled with REG_EXTENDED, so say so. */
        fprintf(stderr, "Invalid extended POSIX regular expression: %s.\n", errbuf);
        return EXIT_FAILURE;
    }

    for (;;) {
        /* After the first match, data + pos is no longer the real start
         * of the string; REG_NOTBOL keeps the '^' alternatives from
         * matching there. */
        const int eflags = (pos > 0) ? REG_NOTBOL : 0;

        if (regexec(&expression, data + pos, 7, match, eflags) != 0) {
            break;
        }

        /* Find the capture group that participated (rm_so is -1 for
         * groups that did not). */
        for (k = 1; k < 7; k++) {
            if (match[k].rm_so >= 0) {
                break;
            }
        }

        /* match[] offsets are relative to data + pos. */
        if (k < 7 && match[k].rm_eo > match[k].rm_so) {
            const size_t off = pos + (size_t)match[k].rm_so;
            const size_t len = (size_t)(match[k].rm_eo - match[k].rm_so);
            /* Print directly from data[] with a precision instead of
             * copying into a variable-length array. */
            printf("Matched '%.*s'.\n", (int)len, data + off);
        }

        /* Defensive: never loop without making progress. */
        if (match[0].rm_eo <= 0) {
            break;
        }
        pos += (size_t)match[0].rm_eo;
    }

    regfree(&expression);
    return EXIT_SUCCESS;
}
However, this variant also detects the desired content when the data[]
is say
device="device 1" device_name="the first device" address="192.168.1.10"
device=device2 device_name=the_second_device address=192.168.1.12
device='device 3' device_name='the third device' address='192.168.1.13'
Personally, I would consider matching on pattern (^|[\t\v\f\r ])([A-Za-z0-9][-_A-Za-z0-9]*)=("[^"]*"|'[^']*'|[^\t\v\f\r ]*)
instead, so that the zeroth match matches each pair, first match the name part, and second match the value part possibly single- or double-quoted. Based on the name part, you could copy the value part (omitting the quotes if quoted) to dynamically allocated buffers.
Upvotes: 1