Reputation: 35
I wrote a C function that swaps the case of a letter: upper case becomes lower case and lower case becomes upper case.
#include <stdio.h>
#include <stdlib.h>
//İi Iı Ğğ Şş Çç Üü Öö
/* Swap the case of a single ASCII letter.
 *
 * letter: the character to convert.
 * Returns the lower-case form of an upper-case letter, the upper-case form
 * of a lower-case letter, and -1 for anything that is not an ASCII letter.
 */
char toUpLow(char letter)
{
    const int is_upper = (letter >= 'A' && letter <= 'Z');
    const int is_lower = (letter >= 'a' && letter <= 'z');

    /* Guard clause: not a letter at all. */
    if (!is_upper && !is_lower) {
        return -1;
    }

    /* The two alphabets sit a fixed distance apart in ASCII. */
    return is_upper ? letter - 'A' + 'a'
                    : letter - 'a' + 'A';
}
/* Read one character from standard input and print it with its case swapped.
 *
 * Returns 0 on success, EXIT_FAILURE when no character could be read.
 */
int main(void)
{
    char myChar;

    printf("Enter a character: ");

    /* scanf returns the number of items it stored.  Anything other than 1
     * (EOF or a read error) means myChar was never written, so stop here
     * instead of passing an uninitialised value to toUpLow. */
    if (scanf("%c", &myChar) != 1) {
        return EXIT_FAILURE;
    }

    printf("%c", toUpLow(myChar));
    return 0;
}
I want to add Turkish letters.
// toUpLow -- attempted Turkish-aware case swap for a single char.
//
// NOTE(review): this version does not work as written.
// - 'İ', 'ı', 'Ğ' and 'ğ' are not ASCII. In a UTF-8 encoded source file each
//   of them is more than one byte, so the compiler reads them as
//   multi-character constants (the -Wmultichar warning quoted in the
//   question). Presumably a single char can never compare equal to such a
//   constant -- confirm against the compiler in use.
// - The Turkish branches print the converted letter and then fall off the end
//   of the function without a return statement, although the return type is
//   char and the caller prints the returned value.
char toUpLow(char letter)
{
// dotted capital I -> dotted small i (prints, does not return)
if (letter == 'İ') {
printf("i");
}
// dotted small i -> dotted capital I (prints, does not return)
else if (letter == 'i')
{
printf("İ");
}
// dotless capital I -> dotless small i (prints, does not return)
else if (letter == 'I')
{
printf("ı");
}
// dotless small i -> dotless capital I (prints, does not return)
else if (letter == 'ı')
{
printf("I");
}
// G with breve, upper -> lower (prints, does not return)
else if (letter == 'Ğ')
{
printf("ğ");
}
// G with breve, lower -> upper (prints, does not return)
else if (letter == 'ğ')
{
printf("Ğ");
}
// remaining ASCII upper-case letters: return the lower-case form
else if (letter >= 'A' && letter <= 'Z') {
return letter - 'A' + 'a';
}
// remaining ASCII lower-case letters: return the upper-case form
else if (letter >= 'a' && letter <= 'z') {
return letter - 'a' + 'A';
}
// anything else is reported as -1
else {
return -1;
}
}
I tried to add the Turkish letters with extra if/else branches, and I got this warning:
uplowfunction.c:22:24: warning: multi-character character constant [-Wmultichar]
Is there a way to do this?
Upvotes: 3
Views: 301
Reputation: 33601
Turkish "characters" aren't a single byte. They are a UTF-8 sequence of 1-4 bytes. See: https://en.wikipedia.org/wiki/UTF-8
The "size" of a given "character" (i.e. codepoint) can vary. That is, the length of the original character can differ from the length of the translated character. (For example: İ is two bytes but i is one byte.)
So, instead of passing char arguments, it's better to pass char * arguments and use partial string comparisons.
So, I've changed your function to accept source and destination strings. And, the strings can be entire words, phrases, sentences, etc.
Here is the refactored code. It is annotated:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// dbgprt -- printf-style trace output, compiled in only when DEBUG is non-zero
//
// Written with the standard C99 variadic form (... / __VA_ARGS__) so it does
// not depend on the GNU named-variadic-parameter extension.  When DEBUG is
// zero or undefined the call expands to an empty statement and its arguments
// are not evaluated.
#if DEBUG
#define dbgprt(...) \
    printf(__VA_ARGS__)
#else
#define dbgprt(...) \
    do { } while (0)
#endif
// xlat -- one case-translation entry: the text to match and its replacement
struct xlat {
// text to look for; a NULL pointer marks the end of the table
const char *old;
// length of old in bytes, as measured by utf8len; set by toUpLow on first use
int olen;
// replacement text written to the output when old matches
const char *new;
// length of new in bytes, as measured by utf8len; set by toUpLow on first use
int nlen;
};
// XLAT1 -- build one table entry that maps _src to _dst
// (olen and nlen are not initialised here, so they start out as zero)
#define XLAT1(_src,_dst) \
{ .old = _src, .new = _dst }
// XLAT2 -- build two entries: _src to _dst, and the reverse _dst to _src
#define XLAT2(_src,_dst) \
XLAT1(_src,_dst), \
XLAT1(_dst,_src)
// Turkish upper/lower pairs held in the table: İi Iı Ğğ Şş Çç Üü Öö
// The table is not const because toUpLow writes olen/nlen into it.
static struct xlat xlatlist[] = {
XLAT2("İ","i"),
XLAT2("I","ı"),
XLAT2("Ğ","ğ"),
XLAT2("Ş","ş"),
XLAT2("Ç","ç"),
XLAT2("Ü","ü"),
XLAT2("Ö","ö"),
// end-of-table marker: table walks stop at the entry whose old pointer is NULL
XLAT1(NULL,NULL)
};
// utf8len -- byte length of the UTF-8 character at the start of str
//
// An ASCII byte (high bit clear) counts as 1.  Otherwise the first byte is
// counted and one more is added for every directly following continuation
// byte (bit pattern 10xxxxxx).  The result is never less than 1.
int
utf8len(const char *str)
{
    int nbytes = 1;

    dbgprt("utf8len: ENTER str='%s'\n",str);

    if ((str[0] & 0x80) != 0) {
        // Walk the bytes after the first one.  The mask test fails on the
        // terminating NUL, on an ASCII byte, and on the first byte of the
        // next multibyte character, which are exactly the stop conditions.
        const char *tail = str + 1;

        while ((*tail & 0xC0) == 0x80) {
            ++nbytes;
            ++tail;
        }
    }

    dbgprt("utf8len: EXIT len=%d\n",nbytes);

    return nbytes;
}
// xlatchar -- report whether str begins with the character stored in xlat->old
//
// xlat: table entry to test (its olen must already be filled in)
// str:  text positioned at the character being examined
// slen: byte length of that character
// Returns 1 when the lengths are equal and the bytes are identical, else 0.
int
xlatchar(const struct xlat *xlat,const char *str,int slen)
{
    int same = 0;

    dbgprt("xlatchar: ENTER old=%d/'%s' new=%d/'%s' str='%s' slen=%d\n",
        xlat->olen,xlat->old,xlat->nlen,xlat->new,str,slen);

    // only compare bytes once the lengths are known to agree
    if (xlat->olen == slen) {
        same = (memcmp(xlat->old,str,xlat->olen) == 0);
    }

    dbgprt("xlatchar: EXIT match=%d\n",same);

    return same;
}
void
toUpLow(char *buf,const char *str)
{
dbgprt("toUpLow: ENTER str='%s'\n",str);
// calculate string lengths in table (once)
static int inited = 0;
if (! inited) {
inited = 1;
dbgprt("toUpLow: XLATINIT\n");
struct xlat *xlat = xlatlist;
for (; xlat->old != NULL; ++xlat) {
xlat->olen = utf8len(xlat->old);
xlat->nlen = utf8len(xlat->new);
}
}
char *dst = buf;
int slen;
int dlen;
for (; *str != 0; str += slen, dst += dlen) {
// get length of first [remaining] char
slen = utf8len(str);
dlen = slen;
// look for match in translation table
int match = 0;
const struct xlat *xlat = xlatlist;
for (; xlat->old != NULL; ++xlat) {
// match on current entry?
match = xlatchar(xlat,str,slen);
// if yes, copy out the _translated_ character
if (match) {
dbgprt("toUpLow: MATCH nlen=%d\n",xlat->nlen);
strcpy(dst,xlat->new);
dlen = xlat->nlen;
break;
}
}
if (match)
continue;
// a multichar that is _not_ part of the translation table
if (slen != 1) {
memcpy(dst,str,slen);
continue;
}
// do standard toupper/tolower equivalent
int letter = *str;
dlen = 1;
if (letter >= 'A' && letter <= 'Z') {
dbgprt("toUpLow: UP2LO\n");
*dst = letter - 'A' + 'a';
continue;
}
if (letter >= 'a' && letter <= 'z') {
dbgprt("toUpLow: LO2UP\n");
*dst = letter - 'a' + 'A';
continue;
}
*dst = letter;
}
*dst = 0;
dbgprt("toUpLow: EXIT buf='%s'\n",buf);
}
// test -- convert one string with toUpLow and print it before and after
//
// str: NUL-terminated UTF-8 text to convert
//
// The output buffer is sized from the input instead of being a fixed array:
// the translation table can replace a one-byte letter (i, I) with a two-byte
// one (İ, ı), so the result may need up to twice the input's bytes plus the
// terminating NUL.
void
test(const char *str)
{
    size_t cap = 2 * strlen(str) + 1;
    char *buf = malloc(cap);

    printf("\n");

    if (buf == NULL) {
        fprintf(stderr,"test: out of memory\n");
        return;
    }

    dbgprt("BEG: %s\n",str);

    toUpLow(buf,str);

    printf("OLD: %s\n",str);
    printf("NEW: %s\n",buf);

    free(buf);
}
// main -- run the case-swap demonstration on a set of sample sentences
int
main(void)
{
    // every Turkish pair from the table, followed by plain ASCII
    test("//İi Iı Ğğ Şş Çç Üü Öö AbCDef");

    // mixed Turkish and ASCII sentences
    test("Quick Brown tilki, LAZY Dogs'un üzerinden atladı");
    test("Mary küçük bir kuzuya sahipti");
    test("Postu kar kadar beyazdı");
    test("Ve Mary'nin gittiği her yerde");
    test("Kuzusunun gideceğinden emindi");

    return 0;
}
Here is the program output:
OLD: //İi Iı Ğğ Şş Çç Üü Öö AbCDef
NEW: //iİ ıI ğĞ şŞ çÇ üÜ öÖ aBcdEF
OLD: Quick Brown tilki, LAZY Dogs'un üzerinden atladı
NEW: qUİCK bROWN TİLKİ, lazy dOGS'UN ÜZERİNDEN ATLADI
OLD: Mary küçük bir kuzuya sahipti
NEW: mARY KÜÇÜK BİR KUZUYA SAHİPTİ
OLD: Postu kar kadar beyazdı
NEW: pOSTU KAR KADAR BEYAZDI
OLD: Ve Mary'nin gittiği her yerde
NEW: vE mARY'NİN GİTTİĞİ HER YERDE
OLD: Kuzusunun gideceğinden emindi
NEW: kUZUSUNUN GİDECEĞİNDEN EMİNDİ
Upvotes: 2
Reputation: 144740
The error message indicates that you are not using a single-byte encoding for the Turkish letters (such as ISO8859-9, Windows code page 1254 or MS/DOS code page 857). You are probably using the UTF-8 encoding, in which non-ASCII Unicode code points are represented as sequences of 2 to 4 bytes.
Non-ASCII characters cannot be used in character constants, or more precisely should not be used in character constants as they would be parsed as multi character constants, which are error prone and non portable.
To convert case in UTF-8 strings, you should either use wide characters or convert full character strings instead of single characters. Beware that the length of the converted string may be different from the length of the original string: strlen("İ") != strlen("i")
Here is a simplistic implementation:
#include <stdlib.h>
#include <string.h>
// Letters whose case partner differs from what plain ASCII arithmetic gives.
// Entries are stored as adjacent pairs, so the partner of index k is k ^ 1.
// The dotted pair is İ/i and the dotless pair is I/ı.
static const char * const tcase[] = {
    "İ", "i",
    "I", "ı",
    "Ğ", "ğ",
    "Ş", "ş",
    "Ç", "ç",
    "Ü", "ü",
    "Ö", "ö",
};

// conver_case_turkish -- return a newly allocated copy of s with the case of
// every letter swapped, using Turkish rules for i/İ and I/ı.
//
// s: NUL-terminated UTF-8 string.
// Returns a malloc'ed string that the caller must free, or NULL when the
// allocation fails or the required size cannot be represented.
char *conver_case_turkish(const char *s) {
    const size_t ncase = sizeof(tcase) / sizeof(*tcase);
    const size_t len = strlen(s);
    size_t i = 0, j = 0;
    char *dest;

    // A one-byte letter can turn into a two-byte one, so reserve 2 bytes per
    // input byte plus the terminator; refuse sizes that would wrap around.
    if (len > ((size_t)-1 - 1) / 2) {
        return NULL;
    }
    dest = malloc(len * 2 + 1);
    if (dest == NULL) {
        return NULL;
    }

    while (i < len) {
        const char c = s[i];
        size_t k;

        // ASCII letters other than I/i: plain offset arithmetic
        if (c >= 'A' && c <= 'Z' && c != 'I') {
            dest[j++] = c - 'A' + 'a';
            i++;
            continue;
        }
        if (c >= 'a' && c <= 'z' && c != 'i') {
            dest[j++] = c - 'a' + 'A';
            i++;
            continue;
        }

        // everything else: look for a table entry that is a prefix of s + i
        for (k = 0; k < ncase; k++) {
            const size_t len1 = strlen(tcase[k]);

            if (strncmp(s + i, tcase[k], len1) == 0) {
                const char *partner = tcase[k ^ 1];
                const size_t len2 = strlen(partner);

                memcpy(dest + j, partner, len2);
                i += len1;
                j += len2;
                break;
            }
        }

        // no table entry matched: copy the byte through unchanged
        if (k == ncase) {
            dest[j++] = c;
            i++;
        }
    }

    dest[j] = '\0';
    return dest;
}
Upvotes: 2