I have an ArrayBuffer
which contains a string encoded using UTF-8 and I can't find a standard way of converting such ArrayBuffer
into a JS String
(which I understand is encoded using UTF-16).
I've seen this code in numerous places, but I fail to see how it would work with any UTF-8 code points that are longer than 1 byte.
return String.fromCharCode.apply(null, new Uint8Array(data));
Similarly, I can't find a standard way of converting from a String
to a UTF-8 encoded ArrayBuffer
.
This question is related to
javascript
string
utf-8
arraybuffer
The latest answers to these type of questions (using nowadays methods) is here: Converting between strings and ArrayBuffers
Using TextEncoder and TextDecoder
var uint8array = new TextEncoder("utf-8").encode("Plain Text");
var string = new TextDecoder().decode(uint8array);
console.log(uint8array ,string )
If you are doing this in browser there are no character encoding libraries built-in, but you can get by with:
function pad(n) {
return n.length < 2 ? "0" + n : n;
}
var array = new Uint8Array(data);
var str = "";
for( var i = 0, len = array.length; i < len; ++i ) {
str += ( "%" + pad(array[i].toString(16)))
}
str = decodeURIComponent(str);
Here's a demo that decodes a 3-byte UTF-8 unit: http://jsfiddle.net/Z9pQE/
If you don't want to use any external polyfill library, you can use this function provided by the Mozilla Developer Network website:
function utf8ArrayToString(aBytes) {_x000D_
var sView = "";_x000D_
_x000D_
for (var nPart, nLen = aBytes.length, nIdx = 0; nIdx < nLen; nIdx++) {_x000D_
nPart = aBytes[nIdx];_x000D_
_x000D_
sView += String.fromCharCode(_x000D_
nPart > 251 && nPart < 254 && nIdx + 5 < nLen ? /* six bytes */_x000D_
/* (nPart - 252 << 30) may be not so safe in ECMAScript! So...: */_x000D_
(nPart - 252) * 1073741824 + (aBytes[++nIdx] - 128 << 24) + (aBytes[++nIdx] - 128 << 18) + (aBytes[++nIdx] - 128 << 12) + (aBytes[++nIdx] - 128 << 6) + aBytes[++nIdx] - 128_x000D_
: nPart > 247 && nPart < 252 && nIdx + 4 < nLen ? /* five bytes */_x000D_
(nPart - 248 << 24) + (aBytes[++nIdx] - 128 << 18) + (aBytes[++nIdx] - 128 << 12) + (aBytes[++nIdx] - 128 << 6) + aBytes[++nIdx] - 128_x000D_
: nPart > 239 && nPart < 248 && nIdx + 3 < nLen ? /* four bytes */_x000D_
(nPart - 240 << 18) + (aBytes[++nIdx] - 128 << 12) + (aBytes[++nIdx] - 128 << 6) + aBytes[++nIdx] - 128_x000D_
: nPart > 223 && nPart < 240 && nIdx + 2 < nLen ? /* three bytes */_x000D_
(nPart - 224 << 12) + (aBytes[++nIdx] - 128 << 6) + aBytes[++nIdx] - 128_x000D_
: nPart > 191 && nPart < 224 && nIdx + 1 < nLen ? /* two bytes */_x000D_
(nPart - 192 << 6) + aBytes[++nIdx] - 128_x000D_
: /* nPart < 127 ? */ /* one byte */_x000D_
nPart_x000D_
);_x000D_
}_x000D_
_x000D_
return sView;_x000D_
}_x000D_
_x000D_
let str = utf8ArrayToString([50,72,226,130,130,32,43,32,79,226,130,130,32,226,135,140,32,50,72,226,130,130,79]);_x000D_
_x000D_
// Must show 2H2 + O2 ? 2H2O_x000D_
console.log(str);
_x000D_
The methods readAsArrayBuffer and readAsText from a FileReader object converts a Blob object to an ArrayBuffer or to a DOMString asynchronous.
A Blob object type can be created from a raw text or byte array, for example.
let blob = new Blob([text], { type: "text/plain" });
let reader = new FileReader();
reader.onload = event =>
{
let buffer = event.target.result;
};
reader.readAsArrayBuffer(blob);
I think it's better to pack up this in a promise:
function textToByteArray(text)
{
let blob = new Blob([text], { type: "text/plain" });
let reader = new FileReader();
let done = function() { };
reader.onload = event =>
{
done(new Uint8Array(event.target.result));
};
reader.readAsArrayBuffer(blob);
return { done: function(callback) { done = callback; } }
}
function byteArrayToText(bytes, encoding)
{
let blob = new Blob([bytes], { type: "application/octet-stream" });
let reader = new FileReader();
let done = function() { };
reader.onload = event =>
{
done(event.target.result);
};
if(encoding) { reader.readAsText(blob, encoding); } else { reader.readAsText(blob); }
return { done: function(callback) { done = callback; } }
}
let text = "\uD83D\uDCA9 = \u2661";
textToByteArray(text).done(bytes =>
{
console.log(bytes);
byteArrayToText(bytes, 'UTF-8').done(text =>
{
console.log(text); // = ?
});
});
The main problem of programmers looking for conversion from byte array into a string is UTF-8 encoding (compression) of unicode characters. This code will help you:
var getString = function (strBytes) {
var MAX_SIZE = 0x4000;
var codeUnits = [];
var highSurrogate;
var lowSurrogate;
var index = -1;
var result = '';
while (++index < strBytes.length) {
var codePoint = Number(strBytes[index]);
if (codePoint === (codePoint & 0x7F)) {
} else if (0xF0 === (codePoint & 0xF0)) {
codePoint ^= 0xF0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
} else if (0xE0 === (codePoint & 0xE0)) {
codePoint ^= 0xE0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
} else if (0xC0 === (codePoint & 0xC0)) {
codePoint ^= 0xC0;
codePoint = (codePoint << 6) | (strBytes[++index] ^ 0x80);
}
if (!isFinite(codePoint) || codePoint < 0 || codePoint > 0x10FFFF || Math.floor(codePoint) != codePoint)
throw RangeError('Invalid code point: ' + codePoint);
if (codePoint <= 0xFFFF)
codeUnits.push(codePoint);
else {
codePoint -= 0x10000;
highSurrogate = (codePoint >> 10) | 0xD800;
lowSurrogate = (codePoint % 0x400) | 0xDC00;
codeUnits.push(highSurrogate, lowSurrogate);
}
if (index + 1 == strBytes.length || codeUnits.length > MAX_SIZE) {
result += String.fromCharCode.apply(null, codeUnits);
codeUnits.length = 0;
}
}
return result;
}
All the best !
There's a polyfill for Encoding over on Github: text-encoding. It's easy for Node or the browser, and the Readme advises the following:
var uint8array = TextEncoder(encoding).encode(string);
var string = TextDecoder(encoding).decode(uint8array);
If I recall, 'utf-8'
is the encoding
you need, and of course you'll need to wrap your buffer:
var uint8array = new Uint8Array(utf8buffer);
Hope it works as well for you as it has for me.
This should work:
// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt
/* utf.js - UTF-8 <=> UTF-16 convertion
*
* Copyright (C) 1999 Masanao Izumo <[email protected]>
* Version: 1.0
* LastModified: Dec 25 1999
* This library is free. You can redistribute it and/or modify it.
*/
function Utf8ArrayToStr(array) {
var out, i, len, c;
var char2, char3;
out = "";
len = array.length;
i = 0;
while (i < len) {
c = array[i++];
switch (c >> 4)
{
case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
// 0xxxxxxx
out += String.fromCharCode(c);
break;
case 12: case 13:
// 110x xxxx 10xx xxxx
char2 = array[i++];
out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
break;
case 14:
// 1110 xxxx 10xx xxxx 10xx xxxx
char2 = array[i++];
char3 = array[i++];
out += String.fromCharCode(((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) |
((char3 & 0x3F) << 0));
break;
}
}
return out;
}
It's somewhat cleaner as the other solutions because it doesn't use any hacks nor depends on Browser JS functions, e.g. works also in other JS environments.
Check out the JSFiddle demo.
Source: Stackoverflow.com