kirbyfan64sos
kirbyfan64sos

Reputation: 10727

Detecting a locale-sensitive digit in C

I've been trying to detect whether or not a certain wchar_t is considered a digit according to the current locale.

At first, I tried using iswdigit, but then I realized it only returns 1 for 0-9, so it isn't locale-sensitive at all.

Then I tried doing iswalnum(c) && !iswalpha(c):

#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#include <stdio.h>

// Prints the result of testing U+0660 for "alphanumeric but not alphabetic"
// under the locale selected by the environment.
int main(void) {
    wchar_t c = L'٠'; // U0660: Arabic-Indic Digit 0
    // Switch from the default "C" locale to the one named by the environment,
    // so the isw*() classification below uses that locale's character classes.
    setlocale(LC_ALL, "");
    // The expression is 1 only when c is in the alnum class but not in the
    // alpha class (the original post annotates this line with "Prints 1.").
    printf("%d\n", iswalnum(c) && !iswalpha(c));
    return 0;
}

Presumably, this would only be 1 if the character passes the alpha-or-number test but not the alpha test. Only problem is, iswalpha(c) returns 1 for some mysterious reason.

Which leads me to my question: is there a way to check if a character is a digit in a locale-sensitive way using only standard C or POSIX, without external libraries? Or am I just screwed?

Upvotes: 1

Views: 177

Answers (2)

deamentiaemundi
deamentiaemundi

Reputation: 5525

As R.. already wrote, there is no built-in solution, but I remembered that I did it in JavaScript a couple of years ago. A rough one-to-one translation to C:

#include <ctype.h>
#include <wchar.h>
// Checks whether w is a character resembling a digit between 0 (zero) and
// 9 (nine): either a basic digit '0'..'9' or a member of one of the digit
// runs listed in the table inside the function.
//
// Assumes wchar_t values are Unicode code points (the table is written in
// terms of code points) -- confirm for the target platform.
//
// Numbers (http://www.fileformat.info/info/unicode/category/No/list.htm)
// are not included, e.g.:
//   0x2460 - 0x2468 CIRCLED DIGITs 1-9
//   0x24ea          CIRCLED DIGITs 0
//   0x2474 - 0x247c PARENTHESIZED DIGITs 1-9
//   0x2488 - 0x2490 DIGITs 1-9 FULL STOP
//   0x24f5 - 0x24fd DOUBLE CIRCLED DIGIT 1-9
//   and so on and so forth
//
// The table stops at 16 bits; that restriction is arbitrary. Candidates above
// 0xffff that are NOT handled:
//   (0x10107 - 0x1010f AEGEAN NUMBERs 1-9)
//   0x102e1 - 0x102e9 COPTIC EPACT DIGITs 1-9
//   0x10a40 - 0x10a43 KHAROSHTHI DIGITs 1-4
//   0x10e60 - 0x10e68 RUMI DIGITs 1-9
//   0x1f100           DIGIT ZERO FULL STOP
//   0x1f101 - 0x1f10a digits with a comma
// The table is not an exhaustive list of the decimal digits in Unicode.
//
// Returns 1 if w is one of the recognised digits, 0 otherwise.
int is_utf16_digit(wchar_t w)
{
  // Inclusive [first, last] code point ranges, in ascending order.
  static const struct {
    unsigned long first;
    unsigned long last;
  } digit_ranges[] = {
    { 0x0660UL, 0x0669UL }, // ARABIC-INDIC DIGITs
    { 0x06f0UL, 0x06f9UL }, // EXTENDED ARABIC-INDIC DIGITs
    { 0x0966UL, 0x096fUL }, // DEVANAGARI DIGITs
    { 0x09e6UL, 0x09efUL }, // BENGALI DIGITs
    { 0x0a66UL, 0x0a6fUL }, // GURMUKHI DIGITs
    { 0x0ae6UL, 0x0aefUL }, // GUJARATI DIGITs
    { 0x0b66UL, 0x0b6fUL }, // ORIYA DIGITs
    { 0x0be6UL, 0x0befUL }, // TAMIL DIGITs (0x0be6 is TAMIL DIGIT ZERO)
    { 0x0c66UL, 0x0c6fUL }, // TELUGU DIGITs
    { 0x0ce6UL, 0x0cefUL }, // KANNADA DIGITs
    { 0x0d66UL, 0x0d6fUL }, // MALAYALAM DIGITs
    { 0x0e50UL, 0x0e59UL }, // THAI DIGITs
    { 0x0ed0UL, 0x0ed9UL }, // LAO DIGITs
    { 0x0f20UL, 0x0f29UL }, // TIBETAN DIGITs
    { 0x1040UL, 0x1049UL }, // MYANMAR DIGITs
    { 0x1369UL, 0x1371UL }, // ETHIOPIC DIGITs (ONE..NINE, there is no zero)
    { 0x17e0UL, 0x17e9UL }, // KHMER DIGITs
    { 0x1810UL, 0x1819UL }, // MONGOLIAN DIGITs
    { 0xff10UL, 0xff19UL }, // FULLWIDTH DIGITs (meh?)
  };
  const size_t range_count = sizeof digit_ranges / sizeof digit_ranges[0];
  unsigned long code_point;

  // A small shortcut for the basic digits. This is a plain range comparison
  // rather than isdigit(): the <ctype.h> functions are only defined for
  // arguments representable as unsigned char (or EOF), which most wide
  // characters are not.
  if (w >= L'0' && w <= L'9') {
    return 1;
  }

  // Where wchar_t is signed, a negative value converts to a very large
  // unsigned value here and therefore matches no range.
  code_point = (unsigned long)w;

  for (size_t i = 0; i < range_count; i++) {
    if (code_point < digit_ranges[i].first) {
      break; // ranges are ascending, so nothing further can match
    }
    if (code_point <= digit_ranges[i].last) {
      return 1;
    }
  }
  return 0;
}

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <string.h>   
// Test driver: converts the single command-line argument from the locale's
// multibyte encoding to wide characters and reports, one line per character,
// whether is_utf16_digit() accepts it.
//
// Exits with EXIT_FAILURE on a usage error, an allocation failure or an
// invalid multibyte sequence; returns EXIT_SUCCESS otherwise.
int main(int argc, char **argv)
{
  wchar_t *w;
  size_t slen, wlen;

  if (argc != 2) {
    fprintf(stderr, "Usage: %s string\n", argv[0]);
    exit(EXIT_FAILURE);
  }

  // mbstowcs() decodes according to LC_CTYPE, so pick up the user's locale.
  setlocale(LC_ALL, "");

  // Every wide character comes from at least one byte, so the byte length
  // plus one slot for the terminator is always enough room. calloc() also
  // checks the size multiplication for overflow.
  slen = strlen(argv[1]);
  w = calloc(slen + 1, sizeof *w);
  if (w == NULL) {
    fprintf(stderr, "calloc() failed to allocate %zu wide characters\n",
            slen + 1);
    exit(EXIT_FAILURE);
  }

  // Pass the full capacity (slen + 1): with a limit of slen, mbstowcs() would
  // not store the terminator for input that converts to exactly slen wide
  // characters (e.g. plain ASCII).
  wlen = mbstowcs(w, argv[1], slen + 1);
  if (wlen == (size_t)-1) {
    fprintf(stderr, "mbstowcs() failed\n");
    free(w);
    exit(EXIT_FAILURE);
  }
  // adjusting memory size with a round of realloc() omitted

  // Iterate by index so that w keeps pointing at the allocation for free().
  for (size_t i = 0; i < wlen; i++) {
    // %lc takes a wint_t and %x an unsigned int, hence the casts.
    if (is_utf16_digit(w[i])) {
      printf("\"%lc\" (0x%04x) is a digit\n", (wint_t)w[i], (unsigned int)w[i]);
    } else {
      printf("\"%lc\" (0x%04x) is not a digit\n", (wint_t)w[i],
             (unsigned int)w[i]);
    }
  }

  free(w);
  return EXIT_SUCCESS;
}

Example with some characters representing digits (mostly zero) from different scripts:

$ ./isutf16digit "asd0asd٠asd೯asd０asd"
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"0" (0x0030) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"٠" (0x0660) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"೯" (0x0cef) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit
"０" (0xff10) is a digit
"a" (0x0061) is not a digit
"s" (0x0073) is not a digit
"d" (0x0064) is not a digit

That together with iswalpha() should do it, but as is always the case with Unicode and standard C without a specialized library: it's doable but it's nasty. See above, for example: the code for testing is quite complicated, and it even assumes a lot to keep it simple.

Upvotes: 0

R.. GitHub STOP HELPING ICE
R.. GitHub STOP HELPING ICE

Reputation: 215287

Regardless of locale, the only characters the C language permits to be in the digit class are the characters '0', '1', '2', ... '9' with values '0'+0, '0'+1, '0'+2, ..., '0'+9. If you want higher-level semantic information about what might be interpreted as a digit or other numeric form by a human using cultural conventions matching the locale, you need a much higher-level semantic library.

As for why iswalnum(c) && !iswalpha(c) didn't work, the alnum class is defined as the union of alpha and digit. Existing locale authorship practice puts "digits" other than the standard decimal ones in the alpha class simply because it's the path of least surprise and at least alnum makes sense then.

Upvotes: 1

Related Questions