Reputation: 1141
I am creating a Node.js script to retrieve data from a website whose body contains emoji and then insert text (including emoji) into a MySQL database.
However, some emoji do not appear to be encoded correctly which is preventing me form inserting text into the database.
When attempting to insert a string that contained the cat face emoji (🐱) into the MySQL database I received the error ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
which indicates a problem with the encoding. However, the response content-type was charset=utf-8
and my database's encoding is utf8mb4_unicode_ci
so in theory I should be able to insert the full range of unicode characters. If I try to insert 🐱 into the database with phpmyadmin the query completes successfully, indicating the problem is in how my Node.js script retrieves the data.
For example, when trying to retrieve data from the emojipedia page for Cat Face my script (below) does not correctly print the cat face emoji, instead printing �. This only appears to happen with certain emoji. For example, the soccer ball emoji (⚽) is printed by my script just fine. Perhaps this has to do with the length of the code-points as the soccer ball's (U+26BD
) is smaller than cat face's (U+1F431
)?
const request = require('request');
const jsdom = require('jsdom');
const $ = require('jquery')(new jsdom.JSDOM().window);
request({
url: 'https://emojipedia.org/emoji/%F0%9F%90%B1/', // This is the url for the cat face page
encoding: 'utf8'
}, (err, response, htmlString) => {
let html = ParseHTML(htmlString);
let emojiElement = html.find('h1 .emoji');
console.log(emojiElement.text()); // prints: �
console.log(emojiElement.text().length); // prints: 2
database.query('UPDATE `posts` SET `emoji` = ? WHERE `id` = 1', [emojiElement.text()], function (err, results) {
if (err) throw err; // Error: ER_TRUNCATED_WRONG_VALUE_FOR_FIELD
});
});
function parseHTML(htmlString) {
const dom = new jsdom.JSDOM(htmlString);
dom.window.document.querySelectorAll('script').forEach(element => element.remove());
dom.window.document.querySelectorAll('head').forEach(element => element.remove());
dom.window.document.querySelectorAll('link').forEach(element => element.remove());
dom.window.document.querySelectorAll('style').forEach(element => element.remove());
dom.window.document.querySelectorAll('iframe').forEach(element => element.remove());
dom.window.document.querySelectorAll('noscript').forEach((element) => {
let replacement = dom.window.document.createElement('div');
replacement.setAttribute('class', 'noscript');
replacement.innerHTML = element.innerHTML;
element.parentNode.replaceChild(replacement, element);
});
dom.window.document.querySelectorAll('img[src]').forEach((element) => {
const src = element.getAttribute('src');
element.setAttribute('data-src', src);
element.removeAttribute('src');
});
dom.window.document.querySelectorAll('[style]').forEach((element) => {
element.removeAttribute('style');
});
return $(dom.window.document.documentElement.innerHTML);
}
How can I correctly retrieve emoji with request in Node.js so they can be inserted into a MySQL database?
Upvotes: 1
Views: 1080
Reputation: 1350
You must initialize your database with the option charset: 'utf8mb4'
.
Additionally the field holding the string containing emoji characters must also use that encoding.
Please provide an example of your DB setup and table schema for additional context, or consider this example:
const mysql = require('mysql');
const connection = mysql.createPool({
connectionLimit: 10,
host: process.env.DB_HOST || '127.0.0.1',
user: process.env.DB_USER || 'local_user',
password: process.env.DB_PASSWORD || 'local_password',
database: process.env.DB_NAME || 'local_database',
multipleStatements: true,
charset: 'utf8mb4' // necessary if you might need support for emoji characters
});
https://github.com/dusthaines/mysqljs_setup_snippet/blob/master/app.js
Upvotes: 2