cdleary
cdleary

Reputation: 71424

C++ string diff (a la Python's difflib)

I'm trying to diff two strings to determine whether or not they solely vary in one numerical subset of the string structure; for example,

varies_in_single_number_field('foo7bar', 'foo123bar')
# Returns True, because 7 != 123, and there's only one varying
# number region between the two strings.

In Python I can use the difflib to accomplish this:

import difflib, doctest

def varies_in_single_number_field(str1, str2):
    """
    A typical use case is as follows:
        >>> varies_in_single_number_field('foo7bar00', 'foo123bar00')
        True

    Numerical variation in two dimensions is no good:
        >>> varies_in_single_number_field('foo7bar00', 'foo123bar01')
        False

    Varying in a nonexistent field is okay:
        >>> varies_in_single_number_field('foobar00', 'foo123bar00')
        True

    Identical strings don't *vary* in any number field:
        >>> varies_in_single_number_field('foobar00', 'foobar00')
        False
    """
    in_differing_substring = False
    passed_differing_substring = False # There should be only one.
    differ = difflib.Differ()
    for letter_diff in differ.compare(str1, str2):
        letter = letter_diff[2:]
        if letter_diff.startswith(('-', '+')):
            if passed_differing_substring: # Already saw a varying field.
                return False
            in_differing_substring = True
            if not letter.isdigit(): return False # Non-digit diff character.
        elif in_differing_substring: # Diff character not found - end of diff.
            in_differing_substring = False
            passed_differing_substring = True
    return passed_differing_substring # No variation if no diff was passed.

if __name__ == '__main__': doctest.testmod()

But I have no idea how to find something like difflib for C++. Alternative approaches welcome. :)

Upvotes: 4

Views: 2949

Answers (5)

Evan Teran
Evan Teran

Reputation: 90422

This might work, it at least passes your demonstration test: EDIT: I've made some modifications to deal with some string indexing issues. I believe it should be good now.

#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <cctype>

bool starts_with(const std::string &s1, const std::string &s2) {
    return (s1.length() <= s2.length()) && (s2.substr(0, s1.length()) == s1);
}

bool ends_with(const std::string &s1, const std::string &s2) {
    return (s1.length() <= s2.length()) && (s2.substr(s2.length() - s1.length()) == s1);
}

bool is_numeric(const std::string &s) {
    for(std::string::const_iterator it = s.begin(); it != s.end(); ++it) {
        if(!std::isdigit(*it)) {
                return false;
        }
    }
    return true;
}

bool varies_in_single_number_field(std::string s1, std::string s2) {

    size_t index1 = 0;
    size_t index2 = s1.length() - 1;

    if(s1 == s2) {
        return false;
    }

    if((s1.empty() && is_numeric(s2)) || (s2.empty() && is_numeric(s1))) {
        return true;
    }

    if(s1.length() < s2.length()) {
        s1.swap(s2);
    }

    while(index1 < s1.length() && starts_with(s1.substr(0, index1), s2)) { index1++; }
    while(ends_with(s1.substr(index2), s2)) { index2--; }

    return is_numeric(s1.substr(index1 - 1, (index2 + 1) - (index1 - 1)));

}

int main() {
    std::cout << std::boolalpha << varies_in_single_number_field("foo7bar00", "foo123bar00") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("foo7bar00", "foo123bar01") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("foobar00", "foo123bar00") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("foobar00", "foobar00") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("7aaa", "aaa") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("aaa7", "aaa") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("aaa", "7aaa") << std::endl;
    std::cout << std::boolalpha << varies_in_single_number_field("aaa", "aaa7") << std::endl;
}

Basically, it looks for a string which has 3 parts, string2 begins with part1, string2 ends with part3 and part2 is only digits.

Upvotes: 2

cdleary
cdleary

Reputation: 71424

@Evan Teran: looks like we did this in parallel -- I have a markedly less readable O(n) implementation:

#include <cassert>
#include <cctype>
#include <string>
#include <sstream>
#include <iostream>

using namespace std;

ostringstream debug;
const bool DEBUG = true;

bool varies_in_single_number_field(const string &str1, const string &str2) {
    bool in_difference = false;
    bool passed_difference = false;
    string str1_digits, str2_digits;
    size_t str1_iter = 0, str2_iter = 0;
    while (str1_iter < str1.size() && str2_iter < str2.size()) {
        const char &str1_char = str1.at(str1_iter);
        const char &str2_char = str2.at(str2_iter);
        debug << "str1: " << str1_char << "; str2: " << str2_char << endl;
        if (str1_char == str2_char) {
            if (in_difference) {
                in_difference = false;
                passed_difference = true;
            }
            ++str1_iter, ++str2_iter;
            continue;
        }
        in_difference = true;
        if (passed_difference) { /* Already passed a difference. */
            debug << "Already passed a difference." << endl;
            return false;
        }
        bool str1_char_is_digit = isdigit(str1_char);
        bool str2_char_is_digit = isdigit(str2_char);
        if (str1_char_is_digit && !str2_char_is_digit) {
            ++str1_iter;
            str1_digits.push_back(str1_char);
        } else if (!str1_char_is_digit && str2_char_is_digit) {
            ++str2_iter;
            str2_digits.push_back(str2_char);
        } else if (str1_char_is_digit && str2_char_is_digit) {
            ++str1_iter, ++str2_iter;
            str1_digits.push_back(str1_char);
            str2_digits.push_back(str2_char);
        } else { /* Both are non-digits and they're different. */
            return false;
        }
    }
    if (in_difference) {
        in_difference = false;
        passed_difference = true;
    }
    string str1_remainder = str1.substr(str1_iter);
    string str2_remainder = str2.substr(str2_iter);
    debug << "Got to exit point; passed difference: " << passed_difference
        << "; str1 digits: " << str1_digits
        << "; str2 digits: " << str2_digits
        << "; str1 remainder: " << str1_remainder
        << "; str2 remainder: " << str2_remainder
        << endl;
    return passed_difference
        && (str1_digits != str2_digits)
        && (str1_remainder == str2_remainder);
}

int main() {
    assert(varies_in_single_number_field("foo7bar00", "foo123bar00") == true);
    assert(varies_in_single_number_field("foo7bar00", "foo123bar01") == false);
    assert(varies_in_single_number_field("foobar00", "foo123bar00") == true);
    assert(varies_in_single_number_field("foobar00", "foobar00") == false);
    assert(varies_in_single_number_field("foobar00", "foobaz00") == false);
    assert(varies_in_single_number_field("foo00bar", "foo01barz") == false);
    assert(varies_in_single_number_field("foo01barz", "foo00bar") == false);
    if (DEBUG) {
        cout << debug.str();
    }
    return 0;
}

Upvotes: 0

MP24
MP24

Reputation: 3200

How about using something like boost::regex?

// pseudo code, may or may not compile
bool match_except_numbers(const std::string& s1, const std::string& s2)
{
    static const boost::regex fooNumberBar("foo\\d+bar");
    return boost::match(s1, fooNumberBar) && boost::match(s2, fooNumberBar);
}

Upvotes: 0

Jesse Beder
Jesse Beder

Reputation: 34034

You could do an ad hoc approach: You're looking to match strings s and s', where s=abc and s'=ab'c, and the b and b' should be two distinct numbers (possible empty). So:

  1. Compare the strings from the left, char by char, until you hit different characters, and then stop. You
  2. Similarly, compare the strings from the right until you hit different characters, OR hit that left marker.
  3. Then check the remainders in the middle to see if they're both numbers.

Upvotes: 1

Douglas Mayle
Douglas Mayle

Reputation: 21747

It's probably a bit of overkill, but you could use boost to interface to python. At the worst, difflib is implemented in pure python, and it's not too long. It should be possible to port from python to C...

Upvotes: 1

Related Questions