Reputation: 23920
How to export data from unidecode
python module for use in another language?
This module converts unicode characters to latin (ascii) characters, roughly preserving phonetic meaning like this:
kožušček => kozuscek
北亰 -> Bei Jing
Москва -> Moskva
This is useful, for example, for creating URLs for international web pages. There are ports to other languages, such as UnidecodeSharp, but they are not of very good quality.
Upvotes: 0
Views: 120
Reputation: 23920
Here is a Python program, unidecode_sqlite.py,
that exports the unidecode data to an SQLite database, which can then be used from every major language:
#!/usr/bin/env python
'''Export unidecode data to SQLite'''
from __future__ import print_function, unicode_literals
import inspect
import os, sys, re
import sqlite3
import unidecode, unicodedata
def unidecode_sqlite(filename):
    '''Export the unidecode transliteration data to a new SQLite database.

    A table ``unidecode`` is created with one row per exported character:

    * ``c``        -- the Unicode character itself (primary key),
    * ``category`` -- its Unicode general category (``unicodedata.category``),
    * ``ascii``    -- the result of ``unidecode.unidecode`` for that character.

    The characters to export are discovered from the ``xHHH.py`` data files
    that sit next to the installed ``unidecode`` module; each such file
    stands for the 256 code points starting at ``0xHHH * 0x100``.

    :param filename: path of the SQLite file to create; it must not exist yet.
    :raises RuntimeError: if ``filename`` already exists.
    '''
    if os.path.exists(filename):
        raise RuntimeError('File exists: %s' % filename)

    # Python 2 compatibility: ``unichr`` only exists there.  Looking the name
    # up directly is reliable, whereas inspecting ``__builtins__`` is not
    # (it is a module in ``__main__`` but a plain dict in imported modules).
    try:
        unichr_ = unichr  # noqa: F821 -- defined on Python 2 only
    except NameError:
        unichr_ = chr

    unidecode_path = os.path.dirname(inspect.getfile(unidecode))
    # Data files are named "x" + three hex digits + ".py".
    section_pattern = re.compile(r'^x([0-9a-f]{3})\.py$', re.IGNORECASE)

    conn = sqlite3.connect(filename)
    try:
        conn.execute(
            '''create table if not exists unidecode (
            c text primary key,
            category text not null,
            ascii text not null
            )'''
        )
        # A distinct loop name keeps the ``filename`` parameter intact.
        for section_file in sorted(os.listdir(unidecode_path)):
            if not os.path.isfile(os.path.join(unidecode_path, section_file)):
                continue
            section_match = section_pattern.match(section_file)
            if not section_match:
                continue
            section_start = int(section_match.group(1), 16) * 0x100
            for char_position in range(0x100):
                character = unichr_(section_start + char_position)
                unidecoded_character = unidecode.unidecode(character)
                # Skip characters reported as None or as the '[?]' placeholder.
                if (unidecoded_character is None
                        or unidecoded_character == '[?]'):
                    continue
                conn.execute(
                    '''insert into unidecode (c, category, ascii)
                    values (?,?,?)''',
                    (
                        character,
                        unicodedata.category(character),
                        unidecoded_character,
                    )
                )
        conn.commit()
        # Compact the file once all rows are committed.
        conn.execute('vacuum')
    finally:
        # Always release the database handle, even when the export fails.
        conn.close()
if __name__ == "__main__":
    # Exactly one argument is expected: the path of the SQLite file to create.
    # A wrong invocation is an error, so the usage text goes to stderr and the
    # exit status is non-zero (2 is the conventional "bad usage" code).
    if len(sys.argv) != 2:
        print('USAGE: %s FILE' % sys.argv[0], file=sys.stderr)
        sys.exit(2)
    try:
        unidecode_sqlite(sys.argv[1])
    except (OSError, RuntimeError, sqlite3.Error) as error:
        # Report expected failures (existing file, I/O or database errors)
        # as a short message instead of a traceback.
        print('ERROR: %s' % error, file=sys.stderr)
        sys.exit(1)
It can be run as follows on any computer with Python (2 or 3; I'm not sure about Windows), and it creates a file of about 1.3 MB:
virtualenv venv
venv/bin/pip install unidecode
venv/bin/python unidecode_sqlite.py unidecode.sqlite
Upvotes: 0