acura
acura

Reputation: 35

How to convert uppercase/lowercase turkish letters to each other?

I made a function in C that swaps the case of a letter: upper case becomes lower case and lower case becomes upper case.

#include <stdio.h>
#include <stdlib.h>

//İi Iı Ğğ Şş Çç Üü Öö
/*
 * toUpLow -- swap the case of a single ASCII letter.
 *
 * letter: the character to convert.
 * Returns the lower-case form of an upper-case letter, the upper-case form
 * of a lower-case letter, and -1 for anything that is not in A-Z / a-z.
 */
char toUpLow(char letter)
{
    const int is_upper = (letter >= 'A' && letter <= 'Z');
    const int is_lower = (letter >= 'a' && letter <= 'z');

    if (is_upper) {
        return letter + ('a' - 'A');
    }

    if (is_lower) {
        return letter - ('a' - 'A');
    }

    // not an ASCII letter
    return -1;
}

/*
 * Reads one character from standard input and prints the value toUpLow
 * returns for it.
 *
 * Returns 0 on success, EXIT_FAILURE if no character could be read.
 */
int main(void)
{
    char myChar;

    printf("Enter a character: ");

    // scanf reports how many items it converted; on EOF or a read error
    // myChar would otherwise be used uninitialised.
    if (scanf("%c", &myChar) != 1) {
        return EXIT_FAILURE;
    }

    printf("%c", toUpLow(myChar));

    return 0;
}

I want to add Turkish letters.

/*
 * toUpLow -- attempted extension of the ASCII case swap to Turkish letters.
 *
 * NOTE(review): this version does not work as intended.
 *  - In a UTF-8 source file the non-ASCII letters (İ, ı, Ğ, ğ) occupy more
 *    than one byte, so writing them between single quotes produces a
 *    multi-character constant; that is what the compiler warning quoted
 *    with this code is about. A single char argument presumably never
 *    compares equal to such a constant.
 *  - The branches that call printf do not return a value even though the
 *    function is declared to return char; a caller that uses the result
 *    after one of those branches ran gets undefined behavior.
 */
char toUpLow(char letter)
{
    // multi-character constant: the letter is not a single byte
    if (letter == 'İ') {
        printf("i");
    }
    // plain ASCII 'i': prints the dotted capital but returns nothing
    else if (letter == 'i')
    {
        printf("İ");
    }
    // plain ASCII 'I': prints the dotless small letter but returns nothing
    else if (letter == 'I')
    {
        printf("ı");
    }
    // multi-character constant again
    else if (letter == 'ı')
    {
        printf("I");
    }
    // multi-character constant again
    else if (letter == 'Ğ')
    {
        printf("ğ");
    }
    // multi-character constant again
    else if (letter == 'ğ')
    {
        printf("Ğ");
    }
    // remaining ASCII letters: swap case arithmetically
    // ('I' and 'i' were already consumed by the branches above)
    else if (letter >= 'A' && letter <= 'Z') {
        return letter - 'A' + 'a';
    }
    else if (letter >= 'a' && letter <= 'z') {
        return letter - 'a' + 'A';
    }
    // anything else is reported as -1
    else {
        return -1;
    }
}

I tried to add Turkish letters with if / else and I got this error:

uplowfunction.c:22:24: warning: multi-character character constant [-Wmultichar]

Is there a way to do this?

Upvotes: 3

Views: 301

Answers (2)

Craig Estey
Craig Estey

Reputation: 33601

Turkish "characters" aren't a single byte. They are a UTF-8 sequence of 1-4 bytes. See: https://en.wikipedia.org/wiki/UTF-8

The "size" of a given "character" (i.e. codepoint) can vary. That is, the length of the original character could be different than the length of the translated character. (For example: İ is two bytes but i is one byte).

So, instead of passing char arguments, it's better to pass char * arguments and use partial string comparisons.

So, I've changed your function to accept source and destination strings. And, the strings can be entire words, phrases, sentences, etc.


Here is the refactored code. It is annotated:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// dbgprt -- printf-style debug trace
// Expands to printf when DEBUG is non-zero and to an empty statement
// otherwise (the arguments are then not evaluated at all).
// Written with the standard C99 variadic-macro form (... / __VA_ARGS__)
// instead of the GNU-only named form so it builds with any C99 compiler.
#if DEBUG
#define dbgprt(...) \
    printf(__VA_ARGS__)
#else
#define dbgprt(...) \
    do { } while (0)
#endif

// xlat -- one entry of the translation table: replace "old" with "new"
struct xlat {
    const char *old;    // byte sequence to look for (NULL marks the end of the table)
    int olen;           // length of old in bytes; filled in by toUpLow on its first call

    const char *new;    // byte sequence that replaces old
    int nlen;           // length of new in bytes; filled in by toUpLow on its first call
};

// XLAT1 -- build a single table entry that maps _src to _dst
// (the length fields are not set here; toUpLow computes them at run time)
#define XLAT1(_src,_dst) \
    { .old = _src, .new = _dst }
// XLAT2 -- build two entries so the mapping works in both directions
#define XLAT2(_src,_dst) \
    XLAT1(_src,_dst), \
    XLAT1(_dst,_src)

//İi Iı Ğğ Şş Çç Üü Öö
// Translation table for the Turkish letter pairs listed above.
// Not const: toUpLow writes the olen/nlen fields the first time it runs.
static struct xlat xlatlist[] = {
    XLAT2("İ","i"),
    XLAT2("I","ı"),
    XLAT2("Ğ","ğ"),
    XLAT2("Ş","ş"),
    XLAT2("Ç","ç"),
    XLAT2("Ü","ü"),
    XLAT2("Ö","ö"),

    // sentinel: the table scans stop when old is NULL
    XLAT1(NULL,NULL)
};

// utf8len -- byte length of the first UTF-8 character in str
// A first byte with the high bit clear counts as one byte (this includes
// the terminating NUL, so an empty string also yields 1). Otherwise the
// first byte is counted and every following continuation byte (bit pattern
// 10xxxxxx) is added to it.
int
utf8len(const char *str)
{
    dbgprt("utf8len: ENTER str='%s'\n",str);

    int count = 1;

    // only a byte with the high bit set can start a multi-byte sequence
    if ((str[0] & 0x80) != 0) {
        // stops on NUL, on an ASCII byte, and on the lead byte of the
        // next multi-byte character: none of those match 10xxxxxx
        const char *p = str + 1;
        while ((*p & 0xC0) == 0x80) {
            ++p;
            ++count;
        }
    }

    dbgprt("utf8len: EXIT len=%d\n",count);

    return count;
}

// xlatchar -- does the character at str match the "old" side of a table entry?
// xlat: the table entry to test (its olen must already be filled in)
// str:  start of the current character in the input
// slen: byte length of that character
// Returns 1 on a match, 0 otherwise.
int
xlatchar(const struct xlat *xlat,const char *str,int slen)
{
    dbgprt("xlatchar: ENTER old=%d/'%s' new=%d/'%s' str='%s' slen=%d\n",
        xlat->olen,xlat->old,xlat->nlen,xlat->new,str,slen);

    // the length test comes first, so the bytes are only compared when
    // both sides are known to be equally long
    int hit = (slen == xlat->olen) &&
        (memcmp(str,xlat->old,xlat->olen) == 0);

    dbgprt("xlatchar: EXIT match=%d\n",hit);

    return hit;
}

void
toUpLow(char *buf,const char *str)
{

    dbgprt("toUpLow: ENTER str='%s'\n",str);

    // calculate string lengths in table (once)
    static int inited = 0;
    if (! inited) {
        inited = 1;
        dbgprt("toUpLow: XLATINIT\n");
        struct xlat *xlat = xlatlist;
        for (;  xlat->old != NULL;  ++xlat) {
            xlat->olen = utf8len(xlat->old);
            xlat->nlen = utf8len(xlat->new);
        }
    }

    char *dst = buf;

    int slen;
    int dlen;
    for (;  *str != 0;  str += slen, dst += dlen) {
        // get length of first [remaining] char
        slen = utf8len(str);
        dlen = slen;

        // look for match in translation table
        int match = 0;
        const struct xlat *xlat = xlatlist;
        for (;  xlat->old != NULL;  ++xlat) {
            // match on current entry?
            match = xlatchar(xlat,str,slen);

            // if yes, copy out the _translated_ character
            if (match) {
                dbgprt("toUpLow: MATCH nlen=%d\n",xlat->nlen);
                strcpy(dst,xlat->new);
                dlen = xlat->nlen;
                break;
            }
        }
        if (match)
            continue;

        // a multichar that is _not_ part of the translation table
        if (slen != 1) {
            memcpy(dst,str,slen);
            continue;
        }

        // do standard toupper/tolower equivalent
        int letter = *str;
        dlen = 1;

        if (letter >= 'A' && letter <= 'Z') {
            dbgprt("toUpLow: UP2LO\n");
            *dst = letter - 'A' + 'a';
            continue;
        }

        if (letter >= 'a' && letter <= 'z') {
            dbgprt("toUpLow: LO2UP\n");
            *dst = letter - 'a' + 'A';
            continue;
        }

        *dst = letter;
    }

    *dst = 0;

    dbgprt("toUpLow: EXIT buf='%s'\n",buf);
}

// test -- convert one string with toUpLow and print it before and after
// str: NUL-terminated UTF-8 string to convert
// The output buffer is sized from the input instead of being a fixed
// 1000-byte array, because toUpLow has no way to limit how much it writes.
void
test(const char *str)
{
    // With the current translation table a character grows by at most one
    // byte ("i" -> "İ", "I" -> "ı"), so twice the input length plus the
    // terminator is always enough.
    size_t cap = 2 * strlen(str) + 1;
    char *buf = malloc(cap);

    if (buf == NULL) {
        fprintf(stderr,"test: out of memory\n");
        return;
    }

    printf("\n");

    dbgprt("BEG: %s\n",str);

    toUpLow(buf,str);

    printf("OLD: %s\n",str);
    printf("NEW: %s\n",buf);

    free(buf);
}

// main -- run the converter over a set of sample strings and print the results
int
main(void)
{
    // every letter pair from the table, followed by mixed-case ASCII
    test("//İi Iı Ğğ Şş Çç Üü Öö AbCDef");

    test("Quick Brown tilki, LAZY Dogs'un üzerinden atladı");

    test("Mary küçük bir kuzuya sahipti");
    test("Postu kar kadar beyazdı");
    test("Ve Mary'nin gittiği her yerde");
    test("Kuzusunun gideceğinden emindi");

    return 0;
}

Here is the program output:


OLD: //İi Iı Ğğ Şş Çç Üü Öö AbCDef
NEW: //iİ ıI ğĞ şŞ çÇ üÜ öÖ aBcdEF

OLD: Quick Brown tilki, LAZY Dogs'un üzerinden atladı
NEW: qUİCK bROWN TİLKİ, lazy dOGS'UN ÜZERİNDEN ATLADI

OLD: Mary küçük bir kuzuya sahipti
NEW: mARY KÜÇÜK BİR KUZUYA SAHİPTİ

OLD: Postu kar kadar beyazdı
NEW: pOSTU KAR KADAR BEYAZDI

OLD: Ve Mary'nin gittiği her yerde
NEW: vE mARY'NİN GİTTİĞİ HER YERDE

OLD: Kuzusunun gideceğinden emindi
NEW: kUZUSUNUN GİDECEĞİNDEN EMİNDİ

Upvotes: 2

chqrlie
chqrlie

Reputation: 144740

The error message indicates that you are not using a single-byte encoding for the Turkish letters, such as ISO8859-9, Windows code page 1254 or MS/DOS code page 857. You are probably using the UTF-8 encoding for Unicode code points, where non-ASCII characters are represented as sequences of 2 to 4 bytes.

Non-ASCII characters cannot be used in character constants, or more precisely should not be used in character constants as they would be parsed as multi character constants, which are error prone and non portable.

To convert case in UTF-8 strings, you should either use wide characters or convert full character strings instead of single characters. Beware that the length of the converted string may be different from the length of the original string: strlen("İ") != strlen("i")

Here is a simplistic implementation:

#include <stdlib.h>
#include <string.h>

// Case pairs for the Turkish letters that cannot be handled by plain ASCII
// arithmetic. Entries are stored as adjacent pairs: the counterpart of
// index k is index k ^ 1, so each pair works in both directions.
// The dotted pair is İ/i and the dotless pair is I/ı.
static const char * const tcase[] = {
    "İ", "i",
    "I", "ı",
    "Ğ", "ğ",
    "Ş", "ş",
    "Ç", "ç",
    "Ü", "ü",
    "Ö", "ö",
};

// conver_case_turkish -- swap the case of every letter in a UTF-8 string
// using Turkish casing rules.
// s: NUL-terminated UTF-8 input; it is not modified.
// Returns a newly allocated NUL-terminated string that the caller must
// free, or NULL if the allocation fails. Bytes that are neither ASCII
// letters nor part of a table entry are copied unchanged.
char *conver_case_turkish(const char *s) {
    size_t len = strlen(s);
    size_t i = 0, j = 0;
    size_t ncase = sizeof(tcase) / sizeof(*tcase);
    // worst case every input byte turns into two ("i" -> "İ", "I" -> "ı")
    char *dest = malloc(len * 2 + 1);

    if (dest == NULL) {
        return NULL;
    }

    while (i < len) {
        char c = s[i];

        // ASCII letters swap arithmetically, except I and i, whose Turkish
        // counterparts are not ASCII and therefore come from the table
        if (c >= 'A' && c <= 'Z' && c != 'I') {
            dest[j++] = c - 'A' + 'a';
            i++;
            continue;
        }
        if (c >= 'a' && c <= 'z' && c != 'i') {
            dest[j++] = c - 'a' + 'A';
            i++;
            continue;
        }

        // look for a table entry that is a prefix of the remaining input
        size_t k;
        for (k = 0; k < ncase; k++) {
            size_t len1 = strlen(tcase[k]);
            if (strncmp(s + i, tcase[k], len1) == 0) {
                size_t len2 = strlen(tcase[k ^ 1]);
                memcpy(dest + j, tcase[k ^ 1], len2);
                i += len1;
                j += len2;
                break;
            }
        }

        // no entry matched: copy the byte as it is
        if (k == ncase) {
            dest[j++] = c;
            i++;
        }
    }
    dest[j] = '\0';

    return dest;
}

Upvotes: 2

Related Questions