Python regex groupdict returns single characters instead of strings for groups

Question

I'm running up against a really confusing issue with Regex matching in Python. I have a pair of regex patterns that work fine in debugging tools such as regex101:

[Hex&Oct matching Pattern] (Code in testing window is the same as the file contents in the console test)
[Base64 matching Pattern] (Far from ideal, but the minimum length of 15 characters helps avoid false positives)
[Hex|Oct splitting Pattern] (Variation of Hex&Oct with different named groups)

However, once implemented in the script, the patterns fail to match anything unless they are both compiled with re.compile() and written as raw strings (prefixed with r before the opening quote).

Even then, the matches return single characters from the group dict.

Can anyone provide any pointers as to what I am doing wrong here?

deobf.py:

#!/bin/python
import sys
import getopt
import re
import base64

####################################################################################
#
# Setting up global vars and functions
#
####################################################################################

# Assemble Pattern Dictionary
# Each pattern matches a quoted string literal: group 1 captures the opening
# quote (single or double), \1 requires the same quote to close it, and the
# named group "obf_code" captures the text in between.
pattern={}
# Zero or more escapes, each a literal backslash, an optional x/X/0/1/2 and
# two hex digits (e.g. \x61 or \156)
pattern["HexOct"]=re.compile(r'(["\'])(?P<obf_code>(\\[xX012]?[\dA-Fa-f]{2})*)\1')
# At least 15 base64-alphabet characters followed by up to two "=" pad characters
pattern["Base64"]=re.compile(r'(["\'])(?P<obf_code>[\dA-Za-z/\+]{15,}={0,2})\1')

# Assemble more precise Pattern handling:
# Matches one escape at a time and tags it by kind: group "Hex" is set for
# \xNN / \XNN, group "Oct" for a backslash followed by two or three digits.
# Whichever alternative did not match is None in groupdict().
sub_pattern={}
sub_pattern["HexOct"]=re.compile(r'((?P<Hex>\\[xX][\dA-Fa-f]{2})|(?P<Oct>\\[012]?[\d]{2}))')

#print pattern # trying to Debug Pattern Dicts
#print sub_pattern # trying to Debug Pattern Dicts

# Global Var init
file_in=""
file_out=""
code_string=""
format_code = False

# Prints the Help screen
# Writes the usage summary to stdout; takes no arguments and returns nothing.
def usage():
    # An entry ending in "\n" leaves a blank line after it in the output
    help_lines = (
        "How to use deobf.py:",
        "-----------------------------------------------------------\n",
        "$ python deobf.py -i {inputfile.php} [-o {outputfile.txt}]\n",
        "Other options include:",
        "-----------------------------------------------------------",
        "-f : Format - Format the output code with indentations",
        "-h : Help - Prints this info\n",
        "-----------------------------------------------------------",
        "You can also use the long forms:",
        "-i : --in",
        "-o : --out",
        "-f : --format",
        # Lowercase: the long option handed to getopt is "help", so the
        # capitalised spelling would be rejected on the command line
        "-h : --help",
    )
    # Single-argument print(...) emits the same text under Python 2 and 3
    print("\n".join(help_lines))

# Intended to replace the hex and octal escapes in obf_code with the
# characters they encode, using the "Hex" and "Oct" groups of
# sub_pattern["HexOct"].
#
# obf_code: the string to scan.
# Returns: obf_code.
#
# NOTE(review): as written the input is returned unchanged. str.replace()
# returns a new string, and every replace() result below is discarded.
def deHexOct(obf_code):
    # re.search() reports only the first match in obf_code, not every escape
    match = re.search(sub_pattern["HexOct"],obf_code)
    if match:

        # Find and process Hex obfuscated elements
        # NOTE(review): groupdict()["Hex"] is one matched string, so this loop
        # visits it one character at a time. It is None when the group did not
        # take part in the match, and iterating None raises TypeError.
        for HexObj in match.groupdict()["Hex"]:
            print match.groupdict()["Hex"]
            print "Processing:"
            # NOTE(review): HexObj is a str here; str has no "pattern" attribute
            print HexObj.pattern
            # NOTE(review): the 16 is passed to chr(), not to int() -- the
            # closing parenthesis looks misplaced. The result is also unused.
            obf_code.replace(HexObj,chr(int(HexObj),16))

        # Find and process Oct obfuscated elements
        # NOTE(review): set() of a string is the set of its distinct
        # characters; the same None caveat as for "Hex" applies.
        for OctObj in set(match.groupdict()["Oct"]):
            print "Processing:"
            print OctObj
            # NOTE(review): same argument placement as the hex case above
            # (8 goes to chr()); the result is unused.
            obf_code.replace(OctObj,chr(int(OctObj),8))
    return obf_code

# Crunch the Data
# Looks for a HexOct-obfuscated and a Base64-obfuscated quoted string in
# file_string and prints progress messages while processing them.
#
# file_string: the text to scan.
# Returns: file_string.
#
# NOTE(review): as written the input is returned unchanged. str.replace()
# returns a new string, and every replace() result below is discarded.
def deObfuscate(file_string):
    # Identify HexOct sections and process
    # re.search() reports only the first match in file_string
    match = re.search(pattern["HexOct"],file_string)
    if match:
        print "HexOct Obfuscation found."
        # NOTE(review): groupdict()["obf_code"] is one matched string, so this
        # loop hands deHexOct() a single character per iteration.
        for HexOctObj in match.groupdict()["obf_code"]:
            print "Processing:"
            print HexOctObj
            file_string.replace(HexOctObj,deHexOct(HexOctObj))

    # Identify B64 sections and process
    # Again only the first match is examined
    match = re.search(pattern["Base64"],file_string)
    if match:
        print "Base64 Obfuscation found."
        # NOTE(review): same per-character iteration as above, so
        # b64decode() receives one character at a time.
        for B64Obj in match.groupdict()["obf_code"]:
            print "Processing:"
            print B64Obj
            file_string.replace(B64Obj,base64.b64decode(B64Obj))

    # Return the (hopefully) deobfuscated string
    return file_string

# File to String
# Reads the whole file at file_path and returns its contents as a string.
# If the file cannot be opened or read, prints an [ERROR] line and returns
# None.
def loadFile(file_path):
    try:
        # "with" closes the handle even when read() fails part-way
        with open(file_path) as file_data:
            return file_data.read()
    # A missing or unreadable file is reported as IOError/OSError. The earlier
    # "except ValueError,TypeError" form caught ValueError only (binding it to
    # the name TypeError), so those failures were never reported.
    except (IOError, OSError, ValueError):
        print("[ERROR] Problem loading the File: " + file_path)
        return None

# String to File
# Writes file_string to file_path, replacing any existing content.
# If the file cannot be opened or written, prints an [ERROR] line.
# Returns nothing.
def saveFile(file_path,file_string):
    try:
        # "with" flushes and closes the handle even when write() fails
        with open(file_path,'w') as file_data:
            file_data.write(file_string)
    # An unwritable path is reported as IOError/OSError. The earlier
    # "except ValueError,TypeError" form caught ValueError only (binding it to
    # the name TypeError), so those failures were never reported.
    except (IOError, OSError, ValueError):
        print("[ERROR] Problem saving the File: " + file_path)

####################################################################################
#
# Main body of Script
#
####################################################################################
# Getting the args
try:
    # A trailing "=" marks a long option that takes a value. Without it,
    # "--in"/"--out" are parsed as bare flags and the file name that follows
    # is left behind in args.
    opts, args = getopt.getopt(sys.argv[1:], "hi:o:f", ["help","in=","out=","format"])
except getopt.GetoptError:
    usage()
    sys.exit(2)

# Handling the args
for opt, arg in opts:
    if opt in ("-h", "--help"):
        usage()
        sys.exit()
    elif opt in ("-i", "--in"):
        file_in = arg
        print("Designated input file: "+file_in)
    elif opt in ("-o", "--out"):
        file_out = arg
        print("Designated output file: "+file_out)
    elif opt in ("-f", "--format"):
        format_code = True
        print("Code Formatting mode enabled")

# Checking the input
if file_in =="":
    print("[ERROR] - No Input File Specified")
    usage()
    sys.exit(2)

# Checking or assigning the output
if file_out == "":
    file_out = file_in+"-deObfuscated.txt"
    print("[INFO] - No Output File Specified - Automatically assigned: "+file_out)

# Zhu Li, Do the Thing!
code_string=loadFile(file_in)
if code_string is None:
    # loadFile() has already printed the error; stop here rather than
    # processing and saving the literal text "None"
    sys.exit(1)
deObf_String=deObfuscate(str(code_string))
saveFile(file_out,deObf_String)

The Console output from my debug prints is as follows:

C:\Users\NJB\workspace\python\deObf>deobf.py -i "Form 5138.php"
Designated input file: Form 5138.php
[INFO] - No Output File Specified - Automatically assigned: Form 5138.php-deObfuscated.txt
HexOct Obfuscation found.
Processing:
\
Processing:
x
Processing:
6
Processing:
1
Processing:
\
Processing:
1
Processing:
5
Processing:
6
Processing:
\
Processing:
x
Processing:
7
Processing:
5
Processing:
\
Processing:
1
Processing:
5
Processing:
6
Processing:
\
Processing:
x
Processing:
6
Processing:
1

Duncan · Accepted Answer

Your regular expression is matching groups just fine, but you are then iterating through the characters in the matched group.

This gives the string you just matched: match.groupdict()["Hex"]

This iterates over the characters in the string:

for HexObj in match.groupdict()["Hex"]:

You want instead to iterate the search, so use re.finditer() instead of re.search(). So something like:

def deHexOct(obf_code):
    # finditer() yields one match object per escape found in obf_code
    for match in re.finditer(sub_pattern["HexOct"],obf_code):
        # Find and process Hex obfuscated elements
        groups = match.groupdict()
        # Named hex_token / oct_token so the hex() and oct() builtins are not
        # shadowed inside the function
        hex_token = groups["Hex"]
        if hex_token:
            print("hex: " + hex_token)
            # do processing here
        # The group that did not take part in this match is None
        oct_token = groups["Oct"]
        if oct_token:
            print("oct: " + oct_token)
            # do processing here

Also, the r in front of the string just stops Python interpreting backslashes as escapes and is needed for regular expressions because they also use backslash for escapes. An alternative would be to double every backslash in your regex; then you wouldn't need the r prefix but the regex might become even less readable.

Python regex groupdict returns single characters instead of strings for groups

Answers (1)

Related Questions