Reputation: 8030
In the code, a switch is used to convert letters to contiguous values. Optimizing compilers in general won't do the job as well as the simple contiguous digit condition right before the switch. How can I detect which execution character set used and/or conclude that the letters are contiguous to replace it with simple conditionals ?
static long digit_value(char c)
{
if (c >= '0' && c <= '9')
return (c-'0');
switch(c) {
case 'a':
case 'A':
return 10;
case 'b':
case 'B':
return 11;
case 'c':
case 'C':
return 12;
case 'd':
case 'D':
return 13;
case 'e':
case 'E':
return 14;
case 'f':
case 'F':
return 15;
case 'g':
case 'G':
return 16;
case 'h':
case 'H':
return 17;
case 'i':
case 'I':
return 18;
case 'j':
case 'J':
return 19;
case 'k':
case 'K':
return 20;
case 'l':
case 'L':
return 21;
case 'm':
case 'M':
return 22;
case 'n':
case 'N':
return 23;
case 'o':
case 'O':
return 24;
case 'p':
case 'P':
return 25;
case 'q':
case 'Q':
return 26;
case 'r':
case 'R':
return 27;
case 's':
case 'S':
return 28;
case 't':
case 'T':
return 29;
case 'u':
case 'U':
return 30;
case 'v':
case 'V':
return 31;
case 'w':
case 'W':
return 32;
case 'x':
case 'X':
return 33;
case 'y':
case 'Y':
return 34;
case 'z':
case 'Z':
return 35;
default:
break;
}
return -1;
}
Upvotes: 7
Views: 237
Reputation: 7490
First of all I have to say that I would choose the preprocessor solution, as the charset of the compiler would be an information I'd like to discover at compile time.
The _static_assert
would be an elegant solution, but since it has been introduced only with C11 standard, real dinosaurs are unlikely to support it.
I would perform the check with common preprocessor directives that is with a chain of #if
s each making sure that the representation of one char is contiguous to the one of the previous char.
The compiling time check has been well covered by the other answers, so I just want to suggest a simple runtime way for excluding EBCDIC charset: the assumption is a scenario in which the charset can either be EBCDIC or ASCII.
In this way, by excluding EBCDIC we can assume that the ASCII charset is used, so the characters are contiguous.
Just two simple EBCDIC features have to be considered:
'j' != 'i'+1
)So:
static long digit_value(char c)
{
if (c >= '0' && c <= '9')
return (c-'0');
if ( 'j' == 'i'+1 ) //if it's not EBCDIC
{
if( c >= 'a' && c <= 'z' )
{
return 10 + c - 'a';
}
else if ( c >= 'A' && c <= 'Z' )
{
return 10 + c - 'A';
}
}
return -1;
}
An error code is returned in case of EBCDIC charset, and in case of ASCII charset a simple conditional make sure that the range [10 - 35]
is returned for both lower case and upper case characters.
Upvotes: 0
Reputation: 67820
To be portable and most efficient use the lookup table
const char table[128] = {
['0'] = 0, ['1'] = 1, /* ... */ ['9'] = 9,
['a'] = 10, ['A'] = 10,
['b'] = 11, ['B'] = 11,
['c'] = 12, ['C'] = 12,
['d'] = 13, ['D'] = 13,
['e'] = 14, ['E'] = 14,
['f'] = 15, ['f'] = 15,
['g'] = 16, ['g'] = 16,
['h'] = 17, ['H'] = 17,
['i'] = 18, ['I'] = 18,
['j'] = 19, ['J'] = 19,
/* etc etc */
};
int get_value(char ch)
{
return table[ch & 0x7f];
}
and the generated code:
get_value:
movsx rdi, dil
movsx eax, BYTE PTR table[rdi]
ret
Upvotes: 0
Reputation: 154075
How can I detect which execution character set used and/or conclude that the letters are contiguous?
At compile time, simply test them all. ('a-z' left out for simplicity)
static_assert(
'A' == ('B' - 1) &&
'B' == ('C' - 1) && 'C' == ('D' - 1) && 'D' == ('E' - 1) && 'E' == ('F' - 1) && 'F' == ('G' - 1) && 'G' == ('H' - 1) && 'H' == ('I' - 1) && 'I' == ('J' - 1) && 'J' == ('K' - 1) && 'K' == ('L' - 1) && 'L' == ('M' - 1) && 'M' == ('N' - 1) && 'N' == ('O' - 1) && 'O' == ('P' - 1) && 'P' == ('Q' - 1) && 'Q' == ('R' - 1) && 'R' == ('S' - 1) && 'S' == ('T' - 1) && 'T' == ('U' - 1) && 'U' == ('V' - 1) && 'V' == ('W' - 1) && 'W' == ('X' - 1) && 'X' == ('Y' - 1) &&
'Y' == ('Z' - 1), "Dinosaur: not continuous A-Z");
static int digit_value(char c) {
if (c >= '0' && c <= '9') return c - '0';
if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
return -1;
}
Or use the slow, but highly portable:
static int digit_value(char c) {
static const char *base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const char *p = strchr(base36, (unsigned char) c);
if (p && *p) {
return (int) (p - base36);
}
return -1;
}
Or perhaps a big #if
?
#if ('A' == ('B' - 1) && 'B' == ('C' - 1) && 'C' == ('D' - 1) && 'D' == ('E' - 1) && 'E' == ('F' - 1) && 'F' == ('G' - 1) && 'G' == ('H' - 1) && 'H' == ('I' - 1) && 'I' == ('J' - 1) && 'J' == ('K' - 1) && 'K' == ('L' - 1) && 'L' == ('M' - 1) && 'M' == ('N' - 1) && 'N' == ('O' - 1) && 'O' == ('P' - 1) && 'P' == ('Q' - 1) && 'Q' == ('R' - 1) && 'R' == ('S' - 1) && 'S' == ('T' - 1) && 'T' == ('U' - 1) && 'U' == ('V' - 1) && 'V' == ('W' - 1) && 'W' == ('X' - 1) && 'X' == ('Y' - 1) && 'Y' == ('Z' - 1))
static int digit_value(char c) {
if (c >= '0' && c <= '9') return c - '0';
if (c >= 'A' && c <= 'Z') return c - 'A' + 10;
return -1;
}
#else
static int digit_value(char c) {
static const char *base36 = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const char *p = strchr(base36, (unsigned char) c);
if (p && *p) {
return (int) (p - base36);
}
return -1;
}
#endif
Or .... if UCHAR_MAX
not too big and concern about speed, make a lookup table and skip the sequential concerns.
#include <limits.h>
int digit_value(char c) {
unsigned char val[UCHAR_MAX] = {['0'] = 1, ['1'] = 2, ['2'] = 3, ['3'] = 4,
['4'] = 5, ['5'] = 6, ['6'] = 7, ['7'] = 8, ['9'] = 10,
['A'] = 11, ['B'] = 12, ['C'] = 13, ['D'] = 14, ['E'] = 15, ...
['a'] = 11, ['b'] = 12, ['c'] = 13, ['d'] = 14, ['e'] = 15, ...
};
return val[(unsigned char) c] - 1;
}
Upvotes: 7
Reputation: 181104
You can write an appropriate test for conditional compilation, as @chux answered. However, if you need to support character sets with non-contiguous letters, then you need an implementation that supports that, and a table lookup would be a lot more efficient than what you present in the question. So much more so that you could consider using it for all cases instead of maintaining two separate implementations. For example:
static long digit_value(char c) {
static const long dvs[UCHAR_MAX + 1] = {
[0] = 1, [1] = 2, [2] = 3, [3] = 4, [5] = 6, [7] = 8, [8] = 9, [9] = 10,
['A'] = 11, ['B'] = 12, ['C'] = 13, ['D'] = 14, ['E'] = 15, ['F'] = 16, ['G'] = 17,
['H'] = 18, ['I'] = 19, ['J'] = 20, ['K'] = 21, ['L'] = 22, ['M'] = 23, ['N'] = 24,
['O'] = 25, ['P'] = 26, ['Q'] = 27, ['R'] = 28, ['S'] = 29, ['T'] = 30, ['U'] = 31,
['V'] = 32, ['W'] = 33, ['X'] = 34, ['Y'] = 35, ['Z'] = 36,
['a'] = 11, ['b'] = 12, ['c'] = 13, ['d'] = 14, ['e'] = 15, ['f'] = 16, ['g'] = 17,
['h'] = 18, ['i'] = 19, ['j'] = 20, ['k'] = 21, ['l'] = 22, ['m'] = 23, ['n'] = 24,
['o'] = 25, ['p'] = 26, ['q'] = 27, ['r'] = 28, ['s'] = 29, ['t'] = 30, ['u'] = 31,
['v'] = 32, ['w'] = 33, ['x'] = 34, ['y'] = 35, ['z'] = 36
};
return dvs[(unsigned char) c] - 1;
}
Note that the characters in the basic execution character set, which include all the decimal digits and upper- and lower-case Latin letters, are guaranteed to have non-negative integer values. That somewhat simplifies the initializer for the table. Note also that elements that are not explicitly initialized will be default-initialized to zero, which eventually gets converted to a -1 return value by subtracting 1 from the tabulated value.
Upvotes: 1