Reputation: 9012
I have some strings in utf-8 format and they have to be converted to uppercase (and / or vice versa). For the standard ASCII characters this is easy as C++ provides functions for this but for non-ASCII characters (like Cyrillic, Greek, ...) this is a hard problem. I found the ICU library (see https://unicode-org.github.io, especially https://unicode-org.github.io, https://unicode-org.github.io/icu/userguide/, https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/) and an example (https://www.delftstack.com/howto/cpp/how-to-convert-string-to-uppercase-cpp/).
From this I constructed an example:
#include <iostream>
#include <string>
#include <algorithm>
#include <unicode/unistr.h>
#include <unicode/ustream.h>
#include <unicode/locid.h>
using std::cout; using std::string;
using std::endl; using std::cin;
using std::transform;
using std::toupper;
// Demo from the question: builds an icu::UnicodeString from each sample
// std::string, upper-cases it with toUpper() and prints input and result.
// The literals are presumably UTF-8 encoded (depends on the source file's
// encoding) — TODO confirm.
int main() {
string string0("hello there είναι απλά ένα κείμενο χωρίς");
string string1("hallo Привет");
string string2("Hallo Привет");
string string3("HALLO ПРИВЕТ");
// NOTE(review): per the ICU API docs, UnicodeString(const char*) decodes its
// argument with ICU's *default converter*, which is not necessarily UTF-8.
// That is the likely reason the non-ASCII characters are missing from the
// output; icu::UnicodeString::fromUTF8() is the UTF-8 entry point — confirm
// against the default codepage of the runtime environment.
icu::UnicodeString unicodeString0(string0.c_str());
// NOTE(review): operator<< from <unicode/ustream.h> is documented to convert
// back to char with the same default converter — confirm.
cout << "input string: " << string0 << endl
<< "output string: " << unicodeString0.toUpper() << endl;
icu::UnicodeString unicodeString1(string1.c_str());
cout << "input string: " << string1 << endl
<< "output string: " << unicodeString1.toUpper() << endl;
icu::UnicodeString unicodeString2(string2.c_str());
cout << "input string: " << string2 << endl
<< "output string: " << unicodeString2.toUpper() << endl;
icu::UnicodeString unicodeString3(string3.c_str());
cout << "input string: " << string3 << endl
<< "output string: " << unicodeString3.toUpper() << endl;
string string4 = "Contrairement à une opinion répandue";
icu::UnicodeString unicodeString4(string4.c_str());
// NOTE(review): toUpper() is documented to modify the string in place and
// return a reference to it, so the "fr-FR" call below presumably operates on
// text that is already upper-cased — confirm.
cout << "input string: " << string4 << endl
<< "output string: " << unicodeString4.toUpper() << endl
<< "output string: " << unicodeString4.toUpper("fr-FR") << endl;
return 0;
}
Compiled it (gcc9.3.0 ICU library 67.1):
g++ s2.cpp -licuio -licuuc -o s2
and when running I get:
input string: hello there είναι απλά ένα κείμενο χωρίς
output string: HELLO THERE
input string: hallo Привет
output string: HALLO
input string: Hallo Привет
output string: HALLO
input string: HALLO ПРИВЕТ
output string: HALLO
input string: Contrairement à une opinion répandue
output string: CONTRAIREMENT UNE OPINION RPANDUE
output string: CONTRAIREMENT UNE OPINION RPANDUE
So none of the non-ASCII characters appear in the output. I must have missed something (probably trivial), but I don't see it. I'm not limited to the ICU library, so other solutions for converting between lowercase and uppercase in C++ are also welcome.
Any suggestions?
Upvotes: 0
Views: 1486
Reputation: 3859
Check/Test my code https://repl.it/@JomaCorpFX/ToUpperToLower#main.cpp
On Windows, to get correct drawing of glyphs/Unicode characters (e.g. "🐶"), I recommend running your program in the new Windows Terminal.
Read this about Windows's console
Windows Command-Line: Unicode and UTF-8 Output Text Buffer
UPDATE: Added CYGWIN's code and some fixes.
Code
#include <clocale>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <locale>
#include <set>
#include <stdexcept>
#include <string>
// Platform detection.
// Exactly one <NAME>_PLATFORM macro is defined (to 1) for the first matching
// branch, together with the DLLCALL / DLLIMPORT / DLLEXPORT / DLLPRIVATE
// decoration macros and the platform headers that branch includes.

// WINDOWS
#if (_WIN32)
// NOMINMAX only suppresses the min()/max() macros of <Windows.h> when it is
// defined BEFORE that header is included, so it has to come first.
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <Windows.h>
#include <conio.h>
#define WINDOWS_PLATFORM 1
#define DLLCALL STDCALL
#define DLLIMPORT _declspec(dllimport)
#define DLLEXPORT _declspec(dllexport)
#define DLLPRIVATE
// CYGWIN
#elif __CYGWIN__
#define CYGWIN_PLATFORM 1
#include <windows.h>
#include <unistd.h>
#include <termios.h>
#define DLLCALL CDECL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
// Map the COM task allocator names onto the C allocator.
#define CoTaskMemAlloc(p) malloc(p)
#define CoTaskMemFree(p) free(p)
// EMSCRIPTEN
#elif defined(__EMSCRIPTEN__)
#include <emscripten/emscripten.h>
#include <emscripten/bind.h>
#include <unistd.h>
#include <termios.h>
#define EMSCRIPTEN_PLATFORM 1
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
// LINUX - Ubuntu, Fedora, Centos, Debian, RedHat
#elif (__LINUX__ || __gnu_linux__ || __linux__ || __linux || linux)
#define LINUX_PLATFORM 1
#include <unistd.h>
#include <termios.h>
#define DLLCALL CDECL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
// Map the COM task allocator names onto the C allocator.
#define CoTaskMemAlloc(p) malloc(p)
#define CoTaskMemFree(p) free(p)
// ANDROID
// NOTE(review): Android toolchains normally also define __linux__, so this
// branch is presumably shadowed by the LINUX branch above — confirm before
// relying on ANDROID_PLATFORM.
#elif (__ANDROID__ || ANDROID)
#define ANDROID_PLATFORM 1
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
// MACOS / IOS
#elif defined(__APPLE__)
#include <unistd.h>
#include <termios.h>
#define DLLCALL
#define DLLIMPORT
#define DLLEXPORT __attribute__((visibility("default")))
#define DLLPRIVATE __attribute__((visibility("hidden")))
#include "TargetConditionals.h"
// Distinguish the Apple targets using the TargetConditionals.h flags.
#if TARGET_OS_IPHONE && TARGET_IPHONE_SIMULATOR
#define IOS_SIMULATOR_PLATFORM 1
#elif TARGET_OS_IPHONE
#define IOS_PLATFORM 1
#elif TARGET_OS_MAC
#define MACOS_PLATFORM 1
#else
#endif
#endif
// Bring the ""s literal suffix into scope; the EMPTY_* macros below rely on it
// wherever they are expanded.
using namespace std::literals::string_literals;

// Project-wide aliases for the narrow and wide string types.
using String = std::string;
using WString = std::wstring;

// Empty string constants for each string flavour.
#define EMPTY_STRING u8""s
#define EMPTY_WSTRING L""s
/// Static helpers that convert between narrow (String) and wide (WString)
/// text and change the case of either representation.
///
/// Embedded NUL characters are preserved by both conversions: the input is
/// split at every NUL, each piece is converted on its own, and the pieces are
/// joined again with a single NUL between them.
class Strings
{
public:
    /// Converts a wide string to its narrow form.
    ///
    /// Windows: UTF-8 via WideCharToMultiByte; a piece the API rejects
    /// contributes no characters. Other platforms: wcstombs, which converts
    /// according to the current C locale.
    ///
    /// @param wstr Text to convert; may be empty and may contain NULs.
    /// @return The converted narrow string.
    /// @throws std::range_error (non-Windows) if wcstombs reports a character
    ///         that cannot be represented.
    static String WideStringToString(const WString &wstr)
    {
        String ret;
        size_t begin = 0;
        for (;;)
        {
            const size_t pos = wstr.find(L'\0', begin);
            if (pos == WString::npos)
            {
                // Last (or only) piece: everything after the final NUL.
                ret.append(NarrowSegment(wstr.substr(begin)));
                break;
            }
            ret.append(NarrowSegment(wstr.substr(begin, pos - begin)));
            ret.push_back('\0'); // re-insert the separator that was split on
            begin = pos + 1;
        }
        return ret;
    }

    /// Converts a narrow string to its wide form.
    ///
    /// Windows: input is decoded as UTF-8 via MultiByteToWideChar; a piece the
    /// API rejects contributes no characters. Other platforms: mbstowcs, which
    /// decodes according to the current C locale.
    ///
    /// @param str Text to convert; may be empty and may contain NULs.
    /// @return The converted wide string.
    /// @throws std::range_error (non-Windows) if mbstowcs reports an invalid
    ///         multibyte sequence.
    static WString StringToWideString(const String &str)
    {
        WString ret;
        size_t begin = 0;
        for (;;)
        {
            const size_t pos = str.find('\0', begin);
            if (pos == String::npos)
            {
                ret.append(WidenSegment(str.substr(begin)));
                break;
            }
            ret.append(WidenSegment(str.substr(begin, pos - begin)));
            ret.push_back(L'\0');
            begin = pos + 1;
        }
        return ret;
    }

    /// Returns an upper-cased copy of `data`, using the std::ctype<wchar_t>
    /// facet of the global C++ locale.
    static WString ToUpper(const WString &data)
    {
        WString result = data;
        const std::locale current; // named copy of the global locale: keeps the facet alive while in use
        const auto &facet = std::use_facet<std::ctype<wchar_t>>(current);
        facet.toupper(&result[0], &result[0] + result.size());
        return result;
    }

    /// Upper-cases a narrow string by round-tripping through WString.
    static String ToUpper(const String &data)
    {
        return WideStringToString(ToUpper(StringToWideString(data)));
    }

    /// Returns a lower-cased copy of `data`, using the std::ctype<wchar_t>
    /// facet of the global C++ locale.
    static WString ToLower(const WString &data)
    {
        WString result = data;
        const std::locale current; // named copy of the global locale: keeps the facet alive while in use
        const auto &facet = std::use_facet<std::ctype<wchar_t>>(current);
        facet.tolower(&result[0], &result[0] + result.size());
        return result;
    }

    /// Lower-cases a narrow string by round-tripping through WString.
    static String ToLower(const String &data)
    {
        return WideStringToString(ToLower(StringToWideString(data)));
    }

private:
    /// Converts one NUL-free wide piece to a narrow string.
    static String NarrowSegment(const WString &segment)
    {
        if (segment.empty())
        {
            return String();
        }
#if WINDOWS_PLATFORM
        const int length = static_cast<int>(segment.size());
        // First call only measures the required number of bytes.
        const int size = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, segment.data(), length, NULL, 0, NULL, NULL);
        if (size <= 0)
        {
            return String(); // conversion rejected: contribute nothing
        }
        String converted(static_cast<size_t>(size), 0);
        WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, segment.data(), length, &converted[0], size, NULL, NULL);
        return converted;
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        // With a null destination wcstombs only measures; it returns
        // (size_t)-1 when a character cannot be converted.
        const size_t size = wcstombs(nullptr, segment.c_str(), 0);
        if (size == static_cast<size_t>(-1))
        {
            throw std::range_error("Strings::WideStringToString: character cannot be represented in the current locale");
        }
        String converted(size, 0);
        wcstombs(&converted[0], segment.c_str(), converted.size());
        return converted;
#else
        static_assert(false, "Unknown Platform");
#endif
    }

    /// Converts one NUL-free narrow piece to a wide string.
    static WString WidenSegment(const String &segment)
    {
        if (segment.empty())
        {
            return WString();
        }
#if WINDOWS_PLATFORM
        // One wide character per input byte (plus one) is always enough room.
        WString converted(segment.size() + 1, 0);
        const int size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, segment.data(), static_cast<int>(segment.size()), &converted[0], static_cast<int>(converted.length()));
        converted.resize(size > 0 ? static_cast<size_t>(size) : 0);
        return converted;
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        // One wide character per input byte is always enough room.
        WString converted(segment.size(), 0);
        const size_t size = mbstowcs(&converted[0], segment.c_str(), converted.size());
        if (size == static_cast<size_t>(-1))
        {
            throw std::range_error("Strings::StringToWideString: invalid multibyte sequence for the current locale");
        }
        converted.resize(size);
        return converted;
#else
        static_assert(false, "Unknown Platform");
#endif
    }
};
/// Text attributes for console output. Each enumerator's numeric value is the
/// code that gets written into the `\033[...m` escape sequence.
enum class ConsoleTextStyle : int
{
    DEFAULT,     // 0
    BOLD,        // 1
    FAINT,       // 2
    ITALIC,      // 3
    UNDERLINE,   // 4
    SLOW_BLINK,  // 5
    RAPID_BLINK, // 6
    REVERSE,     // 7
};
/// Foreground (text) colours. Each enumerator's numeric value is the code
/// that gets written into the `\033[...m` escape sequence.
enum class ConsoleForeground : int
{
    // Codes 30-37
    BLACK = 30,
    DARK_RED = 31,
    DARK_GREEN = 32,
    DARK_YELLOW = 33,
    DARK_BLUE = 34,
    DARK_MAGENTA = 35,
    DARK_CYAN = 36,
    GRAY = 37,

    // Code 39
    DEFAULT = 39,

    // Codes 90-97
    DARK_GRAY = 90,
    RED = 91,
    GREEN = 92,
    YELLOW = 93,
    BLUE = 94,
    MAGENTA = 95,
    CYAN = 96,
    WHITE = 97
};
/// Background colours. Each enumerator's numeric value is the code that gets
/// written into the `\033[...m` escape sequence.
enum class ConsoleBackground : int
{
    // Codes 40-47
    BLACK = 40,
    DARK_RED = 41,
    DARK_GREEN = 42,
    DARK_YELLOW = 43,
    DARK_BLUE = 44,
    DARK_MAGENTA = 45,
    DARK_CYAN = 46,
    GRAY = 47,

    // Code 49
    DEFAULT = 49,

    // Codes 100-107
    DARK_GRAY = 100,
    RED = 101,
    GREEN = 102,
    YELLOW = 103,
    BLUE = 104,
    MAGENTA = 105,
    CYAN = 106,
    WHITE = 107
};
/// Static helpers for writing coloured/styled text to the terminal and for
/// waiting on keyboard input.
///
/// Formatting is done with `\033[...m` escape sequences sent through
/// std::cout. On Windows the text itself is written with WriteConsoleW so that
/// UTF-16 reaches the console unchanged; on the other platforms it goes
/// through std::cout.
class Console
{
private:
    /// Windows only: turns on ENABLE_VIRTUAL_TERMINAL_PROCESSING for stdout if
    /// it is not already set. Does nothing on other platforms.
    static void EnableVirtualTerminalProcessing()
    {
#if WINDOWS_PLATFORM
        HANDLE hOut = GetStdHandle(STD_OUTPUT_HANDLE);
        DWORD dwMode = 0;
        if (!GetConsoleMode(hOut, &dwMode))
        {
            return; // stdout is not a console, nothing to configure
        }
        if (!(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING))
        {
            SetConsoleMode(hOut, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
        }
#endif
    }

    /// Emits the escape sequence that resets all formatting.
    static void ResetTerminalFormat()
    {
        std::cout << u8"\033[0m";
    }

    /// Emits `\033[<fg>;<bg>[;<style>...]m` for the requested formatting.
    static void SetVirtualTerminalFormat(ConsoleForeground foreground, ConsoleBackground background, const std::set<ConsoleTextStyle> &styles)
    {
        String format = u8"\033[";
        format.append(std::to_string(static_cast<int>(foreground)));
        format.append(u8";");
        format.append(std::to_string(static_cast<int>(background)));
        for (const ConsoleTextStyle style : styles)
        {
            format.append(u8";");
            format.append(std::to_string(static_cast<int>(style)));
        }
        format.append(u8"m");
        std::cout << format;
    }

#if WINDOWS_PLATFORM
    /// Writes UTF-16 text directly to the console.
    /// @return false when the write failed (for example because stdout is
    ///         redirected and is not a console), so the caller can fall back.
    static bool WriteToWindowsConsole(const WString &text)
    {
        // The escape sequences travel through std::cout; flush so they reach
        // the console before the text written by WriteConsoleW.
        std::cout.flush();
        DWORD written = 0;
        // Call the W variant explicitly: the WriteConsole macro maps to the
        // ANSI variant when UNICODE is not defined.
        return WriteConsoleW(GetStdHandle(STD_OUTPUT_HANDLE), text.c_str(), static_cast<DWORD>(text.length()), &written, nullptr) != 0;
    }
#endif

public:
    /// Clears the terminal using the platform's clear command.
    static void Clear()
    {
#if WINDOWS_PLATFORM
        std::system(u8"cls");
#elif LINUX_PLATFORM || MACOS_PLATFORM || CYGWIN_PLATFORM
        std::system(u8"clear");
#elif EMSCRIPTEN_PLATFORM
        emscripten::val::global()["console"].call<void>(u8"clear");
#else
        static_assert(false, "Unknown Platform");
#endif
    }

    /// Writes a narrow string with the given colours and styles, then resets
    /// the formatting. No newline is appended.
    static void Write(const String &s, ConsoleForeground foreground = ConsoleForeground::DEFAULT, ConsoleBackground background = ConsoleBackground::DEFAULT, std::set<ConsoleTextStyle> styles = {})
    {
#ifndef EMSCRIPTEN_PLATFORM
        EnableVirtualTerminalProcessing();
        SetVirtualTerminalFormat(foreground, background, styles);
#endif
#if WINDOWS_PLATFORM
        if (!WriteToWindowsConsole(Strings::StringToWideString(s)))
        {
            std::cout << s; // stdout is not a console: keep the text instead of dropping it
        }
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        std::cout << s;
#else
        static_assert(false, "Unknown Platform");
#endif
#ifndef EMSCRIPTEN_PLATFORM
        ResetTerminalFormat();
#endif
    }

    /// Same as Write(const String&, ...) followed by a newline and flush.
    static void WriteLine(const String &s, ConsoleForeground foreground = ConsoleForeground::DEFAULT, ConsoleBackground background = ConsoleBackground::DEFAULT, std::set<ConsoleTextStyle> styles = {})
    {
        Write(s, foreground, background, styles);
        std::cout << std::endl;
    }

    /// Writes a wide string with the given colours and styles, then resets
    /// the formatting. No newline is appended.
    static void Write(const WString &s, ConsoleForeground foreground = ConsoleForeground::DEFAULT, ConsoleBackground background = ConsoleBackground::DEFAULT, std::set<ConsoleTextStyle> styles = {})
    {
#ifndef EMSCRIPTEN_PLATFORM
        EnableVirtualTerminalProcessing();
        SetVirtualTerminalFormat(foreground, background, styles);
#endif
#if WINDOWS_PLATFORM
        if (!WriteToWindowsConsole(s))
        {
            std::cout << Strings::WideStringToString(s); // stdout is not a console
        }
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        std::cout << Strings::WideStringToString(s);
#else
        static_assert(false, "Unknown Platform");
#endif
#ifndef EMSCRIPTEN_PLATFORM
        ResetTerminalFormat();
#endif
    }

    /// Same as Write(const WString&, ...) followed by a newline and flush.
    static void WriteLine(const WString &s, ConsoleForeground foreground = ConsoleForeground::DEFAULT, ConsoleBackground background = ConsoleBackground::DEFAULT, std::set<ConsoleTextStyle> styles = {})
    {
        Write(s, foreground, background, styles);
        std::cout << std::endl;
    }

    /// Writes an empty line.
    static void WriteLine()
    {
        std::cout << std::endl;
    }

    /// Reads characters from stdin, printing "Press Key " after each one,
    /// until the character with code 64 ('@' in ASCII) is read; then prints
    /// "KeyPressed". Stops quietly if stdin reaches end-of-file.
    static void Pause()
    {
        int c; // int, not char, so that EOF can be told apart from real input
        do
        {
            c = std::getchar();
            std::cout << "Press Key " << std::endl;
        } while (c != 64 && c != EOF);
        if (c != EOF)
        {
            std::cout << "KeyPressed" << std::endl;
        }
    }

    /// Waits for a single key press without requiring Enter and without echo.
    ///
    /// @param printWhenPressed When true, the key is written back using the
    ///        given colours and styles.
    /// @return The character code read (EOF if stdin is exhausted on the
    ///         getchar-based platforms).
    static int PauseAny(bool printWhenPressed = false, ConsoleForeground foreground = ConsoleForeground::DEFAULT, ConsoleBackground background = ConsoleBackground::DEFAULT, std::set<ConsoleTextStyle> styles = {})
    {
        int ch;
#if WINDOWS_PLATFORM
        ch = _getch();
#elif LINUX_PLATFORM || MACOS_PLATFORM || EMSCRIPTEN_PLATFORM || CYGWIN_PLATFORM
        struct termios oldt;
        // Only touch the terminal settings when stdin really is a terminal;
        // otherwise oldt would be left uninitialised.
        const bool haveTerminal = tcgetattr(STDIN_FILENO, &oldt) == 0;
        if (haveTerminal)
        {
            struct termios newt = oldt;
            newt.c_lflag &= ~(ICANON | ECHO); // no line buffering, no echo
            tcsetattr(STDIN_FILENO, TCSANOW, &newt);
        }
        ch = std::getchar();
        if (haveTerminal)
        {
            tcsetattr(STDIN_FILENO, TCSANOW, &oldt); // restore the previous settings
        }
#else
        static_assert(false, "Unknown Platform");
#endif
        if (printWhenPressed && ch != EOF)
        {
            Console::Write(String(1, static_cast<char>(ch)), foreground, background, styles);
        }
        return ch;
    }
};
int main()
{
#if CYGWIN_PLATFORM
/*
using c++ std::locale::global(). It generates an error.
terminate called after throwing an instance of 'std::runtime_error'
what(): locale::facet::_S_create_c_locale name not valid
it need to be fixed.
*/
std::setlocale(LC_ALL, u8"en_US.UTF8"); //Calling clasic C locale function all OK. Needed for Console::WriteLine on Linux. If not present throws an error and if not is a unicode locale throws an error. It need to be fixed.
#else
std::locale::global(std::locale(u8"en_US.UTF8")); //Required for Linux. Error when run without set unicode locale. This need to be investigated and fixed.
#endif
String dataStr = u8"Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë";
WString dataWStr = L"Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë";
Console::WriteLine(dataStr);
Console::WriteLine(dataWStr);
dataStr = Strings::ToUpper(dataStr);
dataWStr = Strings::ToUpper(dataWStr);
Console::WriteLine(dataStr);
Console::WriteLine(dataWStr);
dataStr = Strings::ToLower(dataStr);
dataWStr = Strings::ToLower(dataWStr);
Console::WriteLine(dataStr);
Console::WriteLine(dataWStr);
//Another examples
WString string0(L"hello there είναι απλά ένα κείμενο χωρίς");
WString string1(L"hallo Привет");
WString string2(L"Hallo Привет");
WString string3(L"HALLO ПРИВЕТ");
WString string4 = L"Contrairement à une opinion répandue 🐶";
Console::WriteLine(u8"█ Original");
Console::WriteLine(string0);
Console::WriteLine(string1);
Console::WriteLine(string2);
Console::WriteLine(string3);
Console::WriteLine(string4);
Console::WriteLine(u8"█ ToUpper");
string0 = Strings::ToUpper(string0);
string1 = Strings::ToUpper(string1);
string2 = Strings::ToUpper(string2);
string3 = Strings::ToUpper(string3);
string4 = Strings::ToUpper(string4);
Console::WriteLine(string0);
Console::WriteLine(string1);
Console::WriteLine(string2);
Console::WriteLine(string3);
Console::WriteLine(string4);
Console::WriteLine(u8"█ ToLower");
string0 = Strings::ToLower(string0);
string1 = Strings::ToLower(string1);
string2 = Strings::ToLower(string2);
string3 = Strings::ToLower(string3);
string4 = Strings::ToLower(string4);
Console::WriteLine(string0);
Console::WriteLine(string1);
Console::WriteLine(string2);
Console::WriteLine(string3);
Console::WriteLine(string4);
Console::WriteLine(u8"Press any key to exit"s, ConsoleForeground::DARK_GRAY);
Console::PauseAny();
return 0;
}
Output
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
Zoë Saldaña played in La maldición del padre Cardona. ëèñ αω óóChloë
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
ZOË SALDAÑA PLAYED IN LA MALDICIÓN DEL PADRE CARDONA. ËÈÑ ΑΩ ÓÓCHLOË
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
zoë saldaña played in la maldición del padre cardona. ëèñ αω óóchloë
█ Original
hello there είναι απλά ένα κείμενο χωρίς
hallo Привет
Hallo Привет
HALLO ПРИВЕТ
Contrairement à une opinion répandue 🐶
█ ToUpper
HELLO THERE ΕΊΝΑΙ ΑΠΛΆ ΈΝΑ ΚΕΊΜΕΝΟ ΧΩΡΊΣ
HALLO ПРИВЕТ
HALLO ПРИВЕТ
HALLO ПРИВЕТ
CONTRAIREMENT À UNE OPINION RÉPANDUE 🐶
█ ToLower
hello there είναι απλά ένα κείμενο χωρίσ
hallo привет
hallo привет
hallo привет
contrairement à une opinion répandue 🐶
Press any key to exit
Compiled with Visual C++ compiler - Powershell and CMD's output using Windows Terminal - Correct drawing ✔️
Compiled with Visual C++ compiler - pure cmd's output - Partial drawing ❗
Compiled with Visual C++ compiler - pure PowerShell's output - Partial drawing ❗
Compiled with Clang++ in WSL Ubuntu 20.04 - Visual Studio Code's output - Correct drawing ✔️
Compiled with g++ (Cygwin) - Visual Studio Code's output + Cygwin Terminal's output + Windows Terminal's output - Correct drawing ✔️ (glyph rendering differs)
Compiled with Visual C++ compiler - Visual Studio 2019's output - Correct drawing ✔️
Upvotes: 2
Reputation: 11271
edit: I just read that you should not use wchar_t
for unicode, so this answer is not complete.
See my duplicate tag
#include <iostream>
#include <string>
#include <algorithm>
using std::wcout;
using std::wstring;
#define endl '\n'
int main() {
std::locale::global(std::locale("en_US.UTF8"));
std::wcout.imbue(std::locale());
auto& f = std::use_facet<std::ctype<wchar_t>>(std::locale());
wstring string0(L"hello there είναι απλά ένα κείμενο χωρίς");
wstring string1(L"hallo Привет");
wstring string2(L"Hallo Привет");
wstring string3(L"HALLO ПРИВЕТ");
wstring output0 = string0;
f.toupper(&output0[0], &output0[output0.size()]);
wcout << "input string: " << string0 << endl
<< "output string: " << output0 << endl;
wstring output1 = string1;
f.toupper(&output1[0], &output1[output1.size()]);
wcout << "input string: " << string1 << endl
<< "output string: " << output1 << endl;
wstring output2 = string2;
f.toupper(&output2[0], &output2[output2.size()]);
wcout << "input string: " << string2 << endl
<< "output string: " << output2 << endl;
wstring output3 = string3;
f.toupper(&output3[0], &output3[output3.size()]);
wcout << "input string: " << string3 << endl
<< "output string: " << output3 << endl;
wstring string4 = L"Contrairement à une opinion répandue";
wstring output4 = string4;
f.toupper(&output4[0], &output4[output4.size()]);
wcout << "input string: " << string4 << endl
<< "output string: " << output4 << endl;
}
returns
input string: hello there είναι απλά ένα κείμενο χωρίς
output string: HELLO THERE ΕΊΝΑΙ ΑΠΛΆ ΈΝΑ ΚΕΊΜΕΝΟ ΧΩΡΊΣ
input string: hallo Привет
output string: HALLO ПРИВЕТ
input string: Hallo Привет
output string: HALLO ПРИВЕТ
input string: HALLO ПРИВЕТ
output string: HALLO ПРИВЕТ
input string: Contrairement à une opinion répandue
output string: CONTRAIREMENT À UNE OPINION RÉPANDUE
Upvotes: 0