ZeroPhase
ZeroPhase

Reputation: 647

pcre2 conditional replacement regex

I'm trying to write a regex that inserts box-drawing characters based on conditionals, but I keep getting the compilation error "subpattern name expected".

Here is my code:

/*
 * Scans pkginfo line by line and copies every line accepted by the "keep"
 * pattern into a 4096-byte buffer allocated here and stored in *pkgdetail,
 * one line per match, each terminated by '\n'.
 *
 * pkgdetail  out: receives the malloc'd buffer (presumably freed by the
 *            caller; nothing in this function frees it).
 * pkginfo    NUL-terminated text to scan.
 *
 * Returns 0 when the scan loop finishes, 1 if either pattern fails to
 * compile (in which case *pkgdetail has already been allocated).
 */
int match_pkg_details(char **pkgdetail, char *pkginfo)
{
    /*
     * Negative lookahead: reject lines that start with one of the listed
     * field names, and lines consisting only of spaces; otherwise match the
     * whole line.
     */
    PCRE2_SPTR pattern = (PCRE2_SPTR)"^(?!Name|Architecture|URL|Licenses|"\
                    "Installed Size|Packager|Build Date|"\
                    "Install Date|Install Script|Validated By| *$).*$";
    /* NOTE(review): the malloc result is not checked for NULL, and the
     * buffer contents are uninitialised at this point. */
    *pkgdetail = malloc(4096); // FIXME malloc in initializer
    /* Write cursor into the output buffer; advanced after every copied line. */
    char *worker = *pkgdetail;
    size_t pattern_length = strlen((char *)pattern);
    int errornumber;
    PCRE2_SIZE erroroffset;
    /* PCRE2_MULTILINE so that ^ and $ anchor at every line of the subject. */
    pcre2_code *regex = pcre2_compile(
            pattern,
            pattern_length,
            PCRE2_MULTILINE,
            &errornumber,
            &erroroffset,
            NULL);
    if (regex == NULL)
    {
        PCRE2_UCHAR buffer[256];
        pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
        printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
            buffer);
        return 1;
    }

    /*
     * Attempt to express the "which prefix to insert" decision as a regex
     * conditional. a, b and c are placeholder alternatives.
     * NOTE(review): this is the pattern whose compilation is being debugged.
     */
    PCRE2_SPTR replacement = (PCRE2_SPTR)"(?(?=^Install Reason) a | ((?=(\\w) b | ((?=(\\s) c )))))";
                                                                                    // if starts with Install Reason replace with bottom line arrow }}}
    size_t replacement_length = strlen((char*)replacement);
    pcre2_code *replacement_regex = pcre2_compile(
            replacement,
            replacement_length,
            PCRE2_EXTENDED,
            &errornumber,
            &erroroffset,
            NULL);
    if (replacement_regex == NULL)
    {
        PCRE2_UCHAR buffer[256];
        pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
        printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
               buffer);
        /* NOTE(review): returns without releasing the already-compiled
         * `regex`. */
        return 1;
    }
    pcre2_match_data *match_data =
            pcre2_match_data_create_from_pattern(regex, NULL);

    PCRE2_SPTR subject = (PCRE2_SPTR)pkginfo;
    size_t length = strlen((char *)subject);

    /* ovector[1] doubles as the "resume here" position for the loop below. */
    PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
    ovector[1] = 0;

    int rc;
    PCRE2_SIZE offset = 0;
    /* Only the first attempt is anchored / forbidden from matching empty at
     * the start; options is cleared inside the loop. */
    uint32_t options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
    /*
     * NOTE(review): `length - 1` is unsigned and wraps when length is 0.
     * The loop keeps running for any non-zero rc, which includes negative
     * error codes such as PCRE2_ERROR_NOMATCH; only rc == 0 stops it.
     */
    while (offset < length - 1 && (rc =
         pcre2_match(regex, subject, length, offset, options, match_data, NULL)))
    {
        /* Resume from the end of the last match (or from the value stashed
         * by the no-match branch below). */
        offset = ovector[1];
        options = 0;

        if (rc == PCRE2_ERROR_NOMATCH)
        {
            /* Stash offset + 1 in ovector[1]; the top of the loop body
             * copies it back into offset on the next pass. */
            ovector[1] = offset + 1;
            continue;
        }

        /* rc is the number of ovector pairs filled in; the keep pattern has
         * no capturing groups, so this visits the whole match. */
        for (int i = 0; i < rc; i++)
        {
            /* NOTE(review): on the first pass worker points at memory that
             * has never been written, so this strlen reads uninitialised
             * bytes; worker_len is not used afterwards. */
            PCRE2_SIZE worker_len = strlen(worker);
            PCRE2_UCHAR output[4096];
            PCRE2_SIZE outlen;
            /* NOTE(review): outlen is passed uninitialised, and neither rs
             * nor output is read after this call. */
            int rs = pcre2_substitute(
                    replacement_regex,
                    subject,
                    length,
                    offset,
                    PCRE2_SUBSTITUTE_EXTENDED,
                    NULL,
                    NULL,
                    (PCRE2_SPTR)"@",
                    1,
                    output,
                    &outlen);
            PCRE2_SPTR substring_start = subject + ovector[2*i];
            size_t substring_length = ovector[2*i+1] - ovector[2*i];
            /* NOTE(review): the 4096 bound is not reduced as worker advances,
             * so it does not protect the end of the buffer. */
            snprintf(worker, 4096, "%.*s\n", (int)substring_length, (char*)substring_start);
            /* Step past the copied text and its '\n', landing on the NUL so
             * the next line overwrites it. */
            worker += (int)substring_length + 1;
        }
    }

    /* NOTE(review): replacement_regex is never freed. */
    pcre2_match_data_free(match_data);
    pcre2_code_free(regex);
    return 0;
}

The string I'm matching against:

Name            : cinnamon 
Version         : 3.4.6-1 
Description     : Linux desktop which provides advanced innovative features and 
                  a traditional user experience 
Architecture    : x86_64 
URL             : https://github.com/linuxmint/Cinnamon 
Licenses        : GPL2 
Groups          : None 
Provides        : None 
Depends On      : accountsservice  caribou  cinnamon-settings-daemon  
                  cinnamon-session cinnamon-translations  cjs  clutter-gtk 
                  gnome-backgrounds  gnome-themes-standard  gstreamer  
                  libgnome-keyring  libkeybinder3  librsvg  muffin  
                  python2-cairo  python-dbus  python2-dbus  python2-pillow  
                  python2-pam  python2-pexpect  python2-pyinotify  python2-lxml  
                  cinnamon-control-center  cinnamon-screensaver  cinnamon-menus                   
                  libgnomekbd  network-manager-applet  nemo  polkit-gnome  xapps  
                  python2-gobject 
Optional Deps   : blueberry: Bluetooth support [installed]
                  gnome-panel: fallback mode
                  metacity: fallback mode
                  system-config-printer: printer settings [installed] 
Required By     : cinnamon-sound-effects 
Optional For    : None
Conflicts With  : None 
Replaces        : None 
Installed Size  : 8.31 MiB 
Packager        : Antonio Rojas <[email protected]> 
Build Date      : Sat 09 Sep 2017 05:38:21 AM CDT 
Install Date    : Sat 09 Sep 2017 11:37:44 AM CDT 
Install Reason  : Installed as a dependency for another package 
Install Script  : No 
Validated By    : Signature

Currently, if I remove the replacement groups I get:

Version         : 3.4.6-1
Description     : Linux desktop which provides advanced innovative features
                    and a traditional user experience
Provides        : None
Depends On      : accountsservice  caribou  cinnamon-settings-daemon
                  cinnamon-session  cinnamon-translations  cjs  clutter-gtk  gnome-backgrounds
                  gnome-themes-standard  gstreamer  libgnome-keyring  libkeybinder3  librsvg
                  muffin  python2-cairo  python-dbus  python2-dbus  python2-pillow  python2-pam
                  python2-pexpect  python2-pyinotify  python2-lxml  cinnamon-control-center
                  cinnamon-screensaver  cinnamon-menus  libgnomekbd  network-manager-applet
                  nemo  polkit-gnome  xapps  python2-gobject
Optional Deps   : blueberry: Bluetooth support [installed]
Required By     : cinnamon-sound-effects
Optional For    : None
Conflicts With  : None
Replaces        : None
Install Reason  : Installed as a dependency for another package

The intended output looks like:

├─ Version         : 3.4.6-1
├─ Description     : Linux desktop which provides advanced innovative features
│                    and a traditional user experience
├─ Provides        : None
├─ Depends On      : accountsservice  caribou  cinnamon-settings-daemon
│                    cinnamon-session  cinnamon-translations  cjs  clutter-gtk  gnome-backgrounds
│                    gnome-themes-standard  gstreamer  libgnome-keyring  libkeybinder3  librsvg
│                    muffin  python2-cairo  python-dbus  python2-dbus  python2-pillow  python2-pam
│                    python2-pexpect  python2-pyinotify  python2-lxml  cinnamon-control-center
│                    cinnamon-screensaver  cinnamon-menus  libgnomekbd  network-manager-applet
│                    nemo  polkit-gnome  xapps  python2-gobject
├─ Optional Deps   : blueberry: Bluetooth support [installed]
├─ Required By     : cinnamon-sound-effects
├─ Optional For    : None
├─ Conflicts With  : None
├─ Replaces        : None
└─ Install Reason  : Installed as a dependency for another package

a, b, and c are just there for testing purposes (I think I should replace them with named capture groups). I'll be breaking the regex_compile sections out into their own function once I get the replacement working correctly. How can I replace named groups with pcre2_substitute?

Upvotes: 2

Views: 2008

Answers (1)

Lucas Trzesniewski
Lucas Trzesniewski

Reputation: 51330

You're trying to do your logic in the wrong place. You need to handle it in the substitution pattern, not in the regex pattern itself.

First, let's write a pattern which will identify the different parts of your string:

^(?:
    (?<remove>(?:
        Name|Architecture|URL|Licenses|
        Installed[ ]Size|Packager|Build[ ]Date|
        Install[ ]Date|Install[ ]Script|Validated[ ]By
    )\s*:[^\n]*\n)
    |(?<last>(?=Install[ ]Reason\s*:))
    |(?<field>(?=\S))
    |(?<cont>(?=\s))
)

Demo

That's with the mx options (PCRE2_MULTILINE | PCRE2_EXTENDED), but we won't really need PCRE2_EXTENDED in the C code.

This will identify some parts of the string and fill exactly one named capture group in the result:

  • remove for parts to remove
  • last for that last field
  • field for other fields
  • cont for value continuations (a line without a field label)

Next, we'll have to replace each of these parts with a different string:

  • remove => (empty string)
  • last => └─ (I'll be using \- instead in the program below)
  • field => ├─ (I'll be using +- instead in the program below)
  • cont => │ (I'll be using | instead in the program below)

We can let PCRE handle that through PCRE2_SUBSTITUTE_EXTENDED (docs):

The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to group substitution. The syntax is similar to that used by Bash:

${<n>:-<string>}
${<n>:+<string1>:<string2>}

As before, <n> may be a group number or a name. The first form specifies a default value. If group <n> is set, its value is inserted; if not, <string> is expanded and the result inserted. The second form specifies strings that are expanded and inserted when group <n> is set or unset, respectively. The first form is just a convenient shorthand for

${<n>:+${<n>}:<string>}

So, using that syntax, our replacement string looks like this:

${remove:+:${last:+\\- :${field:++- :${cont:+|  :}}}}

Here's a full demo:

#include <stdio.h>
#include <stdlib.h>

#define PCRE2_CODE_UNIT_WIDTH 8
#include <pcre2.h>

PCRE2_SPTR input =
    "Name            : cinnamon\n"
    "Version         : 3.4.6-1\n"
    "Description     : Linux desktop which provides advanced innovative features and\n"
    "                  a traditional user experience\n"
    "Architecture    : x86_64\n"
    "URL             : https://github.com/linuxmint/Cinnamon\n"
    "Licenses        : GPL2\n"
    "Groups          : None\n"
    "Provides        : None\n"
    "Depends On      : accountsservice  caribou  cinnamon-settings-daemon\n"
    "                  cinnamon-session cinnamon-translations  cjs  clutter-gtk\n"
    "                  gnome-backgrounds  gnome-themes-standard  gstreamer \n"
    "                  libgnome-keyring  libkeybinder3  librsvg  muffin \n"
    "                  python2-cairo  python-dbus  python2-dbus  python2-pillow\n"
    "                  python2-pam  python2-pexpect  python2-pyinotify  python2-lxml\n"
    "                  cinnamon-control-center  cinnamon-screensaver  cinnamon-menus\n"
    "                  libgnomekbd  network-manager-applet  nemo  polkit-gnome  xapps\n"
    "                  python2-gobject\n"
    "Optional Deps   : blueberry: Bluetooth support [installed]\n"
    "                  gnome-panel: fallback mode\n"
    "                  metacity: fallback mode\n"
    "                  system-config-printer: printer settings [installed]\n"
    "Required By     : cinnamon-sound-effects\n"
    "Optional For    : None\n"
    "Conflicts With  : None\n"
    "Replaces        : None\n"
    "Installed Size  : 8.31 MiB\n"
    "Packager        : Antonio Rojas <[email protected]>\n"
    "Build Date      : Sat 09 Sep 2017 05:38:21 AM CDT\n"
    "Install Date    : Sat 09 Sep 2017 11:37:44 AM CDT\n"
    "Install Reason  : Installed as a dependency for another package\n"
    "Install Script  : No\n"
    "Validated By    : Signature\n";

PCRE2_SPTR pattern =
    "^(?:"
        "(?<remove>(?:"
            "Name|Architecture|URL|Licenses|"
            "Installed Size|Packager|Build Date|"
            "Install Date|Install Script|Validated By"
        ")\\s*:[^\n]*\n)"
        "|(?<last>(?=Install Reason\\s*:))"
        "|(?<field>(?=\\S))"
        "|(?<cont>(?=\\s))"
    ")";

PCRE2_SPTR replacement =
    "${remove:+:${last:+\\\\- :${field:++- :${cont:+|  :}}}}";

/*
 * Prints the PCRE2 message for an error code on stdout, followed by a
 * newline.
 *
 * code  a PCRE2 error number (from pcre2_compile or a negative return of a
 *       matching/substitution function).
 *
 * If PCRE2 has no text for the code, the numeric value is printed instead.
 */
static void print_error(int code)
{
    /* Zero-filled so that an untouched buffer is detectable as "no text". */
    PCRE2_UCHAR message[256] = {0};

    /* The buffer is passed as a pointer to its first code unit, with its
     * length given in code units rather than bytes. */
    int len = pcre2_get_error_message(code, message,
                                      sizeof(message) / sizeof(message[0]));

    /* A negative return can still leave a truncated but terminated message
     * in the buffer, so fall back only when nothing was written at all. */
    if (len > 0 || message[0] != 0)
        puts((const char *)message);
    else
        printf("PCRE2 error %d\n", code);
}

/*
 * Demo driver: compiles `pattern`, runs a global extended substitution of
 * `replacement` over `input`, and prints the result.
 *
 * The substitution is performed twice: a first call with no output buffer
 * and PCRE2_SUBSTITUTE_OVERFLOW_LENGTH to learn the required size, then a
 * second call into a buffer of that size.
 *
 * Returns 0 on success, 1 on any failure. Every resource acquired is
 * released on all paths.
 */
int main(void)
{
    int status = 1;
    pcre2_code *re = NULL;
    pcre2_match_context *match_context = NULL;
    PCRE2_UCHAR *outbuf = NULL;
    PCRE2_SIZE erroffset;
    PCRE2_SIZE outlength = 0;
    int result;
    int error;

    re = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, PCRE2_MULTILINE,
                       &error, &erroffset, NULL);
    if (!re)
    {
        print_error(error);
        goto cleanup;
    }

    match_context = pcre2_match_context_create(NULL);

    /* Sizing pass: with a zero-length buffer the call is expected to fail
     * with PCRE2_ERROR_NOMEMORY and store the needed length in outlength. */
    result = pcre2_substitute(
        re,
        input,
        PCRE2_ZERO_TERMINATED,
        0,
        PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_EXTENDED,
        NULL,
        match_context,
        replacement,
        PCRE2_ZERO_TERMINATED,
        NULL,
        &outlength
    );

    if (result != PCRE2_ERROR_NOMEMORY)
    {
        print_error(result);
        goto cleanup;
    }

    outbuf = malloc(outlength * sizeof(PCRE2_UCHAR));
    if (!outbuf)
    {
        puts("out of memory");
        goto cleanup;
    }

    /* Real pass: same options minus OVERFLOW_LENGTH, now with a buffer. */
    result = pcre2_substitute(
        re,
        input,
        PCRE2_ZERO_TERMINATED,
        0,
        PCRE2_SUBSTITUTE_GLOBAL | PCRE2_SUBSTITUTE_EXTENDED,
        NULL,
        match_context,
        replacement,
        PCRE2_ZERO_TERMINATED,
        outbuf,
        &outlength
    );

    if (result < 0)
    {
        print_error(result);
        goto cleanup;
    }

    puts((const char *)outbuf);
    status = 0;

cleanup:
    /* All three release functions accept NULL, so no guards are needed. */
    free(outbuf);
    pcre2_match_context_free(match_context);
    pcre2_code_free(re);

    return status;
}

The output is:

+- Version         : 3.4.6-1
+- Description     : Linux desktop which provides advanced innovative features and
|                    a traditional user experience
+- Groups          : None
+- Provides        : None
+- Depends On      : accountsservice  caribou  cinnamon-settings-daemon
|                    cinnamon-session cinnamon-translations  cjs  clutter-gtk
|                    gnome-backgrounds  gnome-themes-standard  gstreamer
|                    libgnome-keyring  libkeybinder3  librsvg  muffin
|                    python2-cairo  python-dbus  python2-dbus  python2-pillow
|                    python2-pam  python2-pexpect  python2-pyinotify  python2-lxml
|                    cinnamon-control-center  cinnamon-screensaver  cinnamon-menus
|                    libgnomekbd  network-manager-applet  nemo  polkit-gnome  xapps
|                    python2-gobject
+- Optional Deps   : blueberry: Bluetooth support [installed]
|                    gnome-panel: fallback mode
|                    metacity: fallback mode
|                    system-config-printer: printer settings [installed]
+- Required By     : cinnamon-sound-effects
+- Optional For    : None
+- Conflicts With  : None
+- Replaces        : None
\- Install Reason  : Installed as a dependency for another package

I think I should mention that in your case it would certainly be easier to just do the string manipulation by hand rather than going through a regex pattern.

Upvotes: 4

Related Questions