Reputation: 10727
I've been trying to detect whether or not a certain wchar_t
is considered a digit according to the current locale.
At first, I tried using iswdigit
, but then I realized it only returns 1
for 0-9
, so it isn't locale-sensitive at all.
Then I tried doing iswalnum(c) && !iswalpha(c)
:
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#include <stdio.h>
/*
 * Demo: evaluates iswalnum(c) && !iswalpha(c) for U+0660 under the
 * environment's locale and prints the result (1 or 0).
 */
int main(void) {
    wchar_t c = L'٠'; // U+0660: ARABIC-INDIC DIGIT ZERO
    // Select the locale from the environment so the isw*() classification
    // is not done in the default "C" locale.
    setlocale(LC_ALL, "");
    // NOTE(review): if the active locale classifies U+0660 as alphabetic
    // (iswalpha(c) != 0), this expression is 0, not 1 -- confirm on the
    // target platform.
    printf("%d\n", iswalnum(c) && !iswalpha(c));
    return 0;
}
Presumably, this would only be 1
if the character passes the alpha-or-number test but not the alpha test. Only problem is, iswalpha(c)
returns 1
for some mysterious reason.
Which leads me to my question: is there a way to check if a character is a digit in a locale-sensitive way using only standard C or POSIX, without external libraries? Or am I just screwed?
Upvotes: 1
Views: 177
Reputation: 5525
As R.. already wrote, there is no built-in solution, but I remembered that I did it in JavaScript a couple of years ago. A rough one-to-one translation to C:
#include <ctype.h>
#include <wchar.h>
/*
 * Checks whether w is a character resembling a decimal digit between
 * 0 (zero) and 9 (nine) in one of the scripts listed in the table below.
 *
 * Returns 1 if w falls inside one of the listed ranges, 0 otherwise.
 *
 * Numbers (http://www.fileformat.info/info/unicode/category/No/list.htm)
 * are not included.
 * The restriction to 16 bit is arbitrary; the list is hand-picked and makes
 * no claim to be complete.
 *
 * The values are compared as Unicode code points, i.e. the function assumes
 * that wchar_t holds Unicode scalar values.
 *
 * Deliberately left out (digits carrying decoration or lacking a zero):
 *   0x2460 - 0x2468  CIRCLED DIGITs 1-9
 *   0x24ea           CIRCLED DIGIT 0
 *   0x2474 - 0x247c  PARENTHESIZED DIGITs 1-9
 *   0x2488 - 0x2490  DIGITs 1-9 FULL STOP
 *   0x24f5 - 0x24fd  DOUBLE CIRCLED DIGITs 1-9
 *   and so on and so forth
 *
 * First candidates larger than 16 bits (also not handled):
 *   (0x10107 - 0x1010f AEGEAN NUMBERs 1-9)
 *   0x102e1 - 0x102e9  COPTIC EPACT DIGITs 1-9
 *   0x10a40 - 0x10a43  KHAROSHTHI DIGITs 1-4
 *   0x10e60 - 0x10e68  RUMI DIGITs 1-9
 *   0x1f100            zero for DIGITs 1-9 FULL STOP
 *   0x1f101 - 0x1f10a  digits with a comma
 */
int is_utf16_digit(wchar_t w)
{
    /* Inclusive code point ranges, sorted ascending for legibility. */
    static const struct {
        wchar_t first;
        wchar_t last;
    } digit_ranges[] = {
        /* Compared directly instead of via isdigit(): passing a value that
           does not fit in unsigned char to isdigit() is undefined. */
        { 0x0030, 0x0039 }, // ASCII DIGITs
        { 0x0660, 0x0669 }, // ARABIC-INDIC DIGITs
        { 0x06f0, 0x06f9 }, // EXTENDED ARABIC-INDIC DIGITs
        { 0x0966, 0x096f }, // DEVANAGARI DIGITs
        { 0x09e6, 0x09ef }, // BENGALI DIGITs
        { 0x0a66, 0x0a6f }, // GURMUKHI DIGITs
        { 0x0ae6, 0x0aef }, // GUJARATI DIGITs
        { 0x0b66, 0x0b6f }, // ORIYA DIGITs
        { 0x0be6, 0x0bef }, // TAMIL DIGITs (0x0be6 is TAMIL DIGIT ZERO)
        { 0x0c66, 0x0c6f }, // TELUGU DIGITs
        { 0x0ce6, 0x0cef }, // KANNADA DIGITs
        { 0x0d66, 0x0d6f }, // MALAYALAM DIGITs
        { 0x0e50, 0x0e59 }, // THAI DIGITs
        { 0x0ed0, 0x0ed9 }, // LAO DIGITs
        { 0x0f20, 0x0f29 }, // TIBETAN DIGITs
        { 0x1040, 0x1049 }, // MYANMAR DIGITs
        { 0x1369, 0x1371 }, // ETHIOPIC DIGITs (only nine code points listed)
        { 0x17e0, 0x17e9 }, // KHMER DIGITs
        { 0x1810, 0x1819 }, // MONGOLIAN DIGITs
        { 0xff10, 0xff19 }  // FULLWIDTH DIGITs (meh?)
    };

    for (size_t i = 0; i < sizeof digit_ranges / sizeof digit_ranges[0]; i++) {
        if (w >= digit_ranges[i].first && w <= digit_ranges[i].last) {
            return 1;
        }
    }
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <string.h>
/*
 * Test driver: converts the single command-line argument from the locale's
 * multibyte encoding to wide characters and reports, one line per character,
 * whether is_utf16_digit() accepts it.
 *
 * Exits with EXIT_FAILURE on wrong usage, allocation failure or an invalid
 * multibyte sequence; EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv)
{
    wchar_t *wide;
    size_t slen, wlen, capacity, nbytes;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s string\n", argv[0]);
        exit(EXIT_FAILURE);
    }
    // mbstowcs() decodes according to LC_CTYPE, so pick up the user's locale.
    setlocale(LC_ALL, "");

    // A multibyte string of slen bytes decodes to at most slen wide
    // characters; one extra slot is reserved for the terminating L'\0'.
    slen = strlen(argv[1]);
    capacity = slen + 1;
    nbytes = capacity * sizeof *wide;
    wide = malloc(nbytes);
    if (wide == NULL) {
        fprintf(stderr, "malloc() failed to allocate %zu bytes\n", nbytes);
        exit(EXIT_FAILURE);
    }

    // Passing the full capacity guarantees that the terminator is written.
    wlen = mbstowcs(wide, argv[1], capacity);
    if (wlen == (size_t)-1) {
        fprintf(stderr, "mbstowcs() failed\n");
        free(wide);
        exit(EXIT_FAILURE);
    }
    // adjusting memory size with a round of realloc() omitted

    // Index instead of advancing the pointer so the buffer can be freed.
    for (size_t i = 0; i < wlen; i++) {
        // %lc expects wint_t and %x expects unsigned int.
        if (is_utf16_digit(wide[i])) {
            printf("\"%lc\" (0x%04x) is a digit\n",
                   (wint_t)wide[i], (unsigned int)wide[i]);
        } else {
            printf("\"%lc\" (0x%04x) is not a digit\n",
                   (wint_t)wide[i], (unsigned int)wide[i]);
        }
    }

    free(wide);
    exit(EXIT_SUCCESS);
}
Example with some characters representing digits from different scripts (U+0CEF is KANNADA DIGIT NINE; the last digit is FULLWIDTH DIGIT ZERO, U+FF10)
$ ./isutf16digit "asd0asd٠asd೯asd０asd"
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"0" (0x0030) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"٠" (0x0660) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"೯" (0x0cef) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"０" (0xff10) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
That together with iswalpha()
should do it, but as is always the case with Unicode and standard C without a specialized library: it's doable, but it's nasty. See above, for example: the code for testing is quite complicated, and it even assumes a lot to keep it simple.
Upvotes: 0
Reputation: 215287
Regardless of locale, the only characters the C language permits to be in the digit
class are the characters '0'
, '1'
, '2'
, ... '9'
with values '0'+0
, '0'+1
, '0'+2
, ..., '0'+9
. If you want higher-level semantic information about what might be interpreted as a digit or other numeric form by a human using cultural conventions matching the locale, you need a much higher-level semantic library.
As for why iswalnum(c) && !iswalpha(c)
didn't work, the alnum
class is defined as the union of alpha
and digit
. Existing locale authorship practice puts "digits" other than the standard decimal ones in the alpha
class simply because it's the path of least surprise and at least alnum
makes sense then.
Upvotes: 1