The .charCodeAt function returns the Unicode code of the character. But I would like to get the byte array instead. I know that if the character code is over 127, the character is stored in two or more bytes.
var arr = [];
for (var i = 0; i < str.length; i++) {
    arr.push(str.charCodeAt(i));
}
function convertByte()
{
    var c = document.getElementById("str").value;
    var arr = [];
    for (var ind = 0; ind < c.length; ind++)
    {
        // charCodeAt() returns the UTF-16 code unit at this index
        arr[ind] = c.charCodeAt(ind);
    }
    document.getElementById("result").innerHTML = "The converted value is " + arr.join(", ");
}
JavaScript Strings are stored in UTF-16. To get UTF-8, you'll have to convert the String yourself.
One way is to combine encodeURIComponent(), which will output UTF-8 bytes URL-encoded, with unescape, as mentioned on ecmanaut.
var utf8 = unescape(encodeURIComponent(str));
var arr = [];
for (var i = 0; i < utf8.length; i++) {
    arr.push(utf8.charCodeAt(i));
}
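Going the other way, the same deprecated-but-widely-supported pair can be used in reverse: build a binary string from the byte values and run it through escape() and decodeURIComponent(). This is only a sketch, and the name utf8BytesToString is mine, not part of the original answer:
// Sketch: turn an array of UTF-8 byte values back into a string
function utf8BytesToString(bytes) {
    var binary = "";
    for (var i = 0; i < bytes.length; i++) {
        binary += String.fromCharCode(bytes[i]);
    }
    return decodeURIComponent(escape(binary));
}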
The Google Closure library has functions to convert to/from UTF-8 and byte arrays. If you don't want to use the whole library, you can copy the functions from here. For completeness, the code to convert a string to a UTF-8 byte array is:
goog.crypt.stringToUtf8ByteArray = function(str) {
  // TODO(user): Use native implementations if/when available
  var out = [], p = 0;
  for (var i = 0; i < str.length; i++) {
    var c = str.charCodeAt(i);
    if (c < 128) {
      out[p++] = c;
    } else if (c < 2048) {
      out[p++] = (c >> 6) | 192;
      out[p++] = (c & 63) | 128;
    } else if (
        ((c & 0xFC00) == 0xD800) && (i + 1) < str.length &&
        ((str.charCodeAt(i + 1) & 0xFC00) == 0xDC00)) {
      // Surrogate Pair
      c = 0x10000 + ((c & 0x03FF) << 10) + (str.charCodeAt(++i) & 0x03FF);
      out[p++] = (c >> 18) | 240;
      out[p++] = ((c >> 12) & 63) | 128;
      out[p++] = ((c >> 6) & 63) | 128;
      out[p++] = (c & 63) | 128;
    } else {
      out[p++] = (c >> 12) | 224;
      out[p++] = ((c >> 6) & 63) | 128;
      out[p++] = (c & 63) | 128;
    }
  }
  return out;
};
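As a quick check of what this produces (my own example, not from the library docs), the euro sign U+20AC should come out as its familiar three-byte UTF-8 sequence:
// '€' is U+20AC, which is E2 82 AC in UTF-8
console.log(goog.crypt.stringToUtf8ByteArray('€')); // [226, 130, 172]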
The TextEncoder and TextDecoder from the Encoding API will let you both encode and decode UTF-8 easily (using typed arrays):
const encoded = new TextEncoder().encode("Γειά σου Κόσμε");
const decoded = new TextDecoder().decode(encoded);
console.log(encoded, decoded);
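Note that encode() returns a Uint8Array; if you specifically want a plain Array of numbers, as the question asks for, copying it over is enough:
// Convert the Uint8Array from TextEncoder into a plain Array of byte values
const byteArray = Array.from(new TextEncoder().encode("€"));
console.log(byteArray); // [226, 130, 172]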
Browser support isn't too bad, and there's a polyfill that should work in IE11 and older versions of Edge.
Older versions of TextEncoder accepted other encodings as a string constructor argument, but since Firefox 48 and Chrome 53 this is no longer supported and TextEncoder encodes UTF-8 only.
So encoding to a legacy charset like this no longer works in modern web browsers used since 2016:
new TextEncoder("shift-jis").encode(text)
(Decoding legacy encodings with TextDecoder, e.g. new TextDecoder("shift-jis").decode(new Uint8Array(textbuffer)), still works; only encoding to them was removed.)
As there is no pure byte type in JavaScript, we can represent a byte array as an array of numbers, where each number represents a byte and thus has an integer value between 0 and 255 inclusive.
Here is a simple function that converts a JavaScript string into an array of numbers containing the UTF-8 encoding of the string:
function toUtf8(str) {
    var value = [];
    var destIndex = 0;
    for (var index = 0; index < str.length; index++) {
        // charCodeAt() returns a UTF-16 code unit (0..0xFFFF); surrogate pairs
        // are not combined, so the branches above 0xFFFF never run here.
        var code = str.charCodeAt(index);
        if (code <= 0x7F) {
            value[destIndex++] = code;
        } else if (code <= 0x7FF) {
            value[destIndex++] = ((code >> 6 ) & 0x1F) | 0xC0;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0xFFFF) {
            value[destIndex++] = ((code >> 12) & 0x0F) | 0xE0;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x1FFFFF) {
            value[destIndex++] = ((code >> 18) & 0x07) | 0xF0;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x03FFFFFF) {
            value[destIndex++] = ((code >> 24) & 0x03) | 0xF8;
            value[destIndex++] = ((code >> 18) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x7FFFFFFF) {
            value[destIndex++] = ((code >> 30) & 0x01) | 0xFC;
            value[destIndex++] = ((code >> 24) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 18) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else {
            throw new Error("Unsupported Unicode character \""
                + str.charAt(index) + "\" with code " + code + " (binary: "
                + toBinary(code) + ") at index " + index
                + ". Cannot represent it as UTF-8 byte sequence.");
        }
    }
    return value;
}
function toBinary(byteValue) {
    if (byteValue < 0) {
        byteValue = byteValue & 0x00FF;
    }
    var str = byteValue.toString(2);
    var len = str.length;
    var prefix = "";
    for (var i = len; i < 8; i++) {
        prefix += "0";
    }
    return prefix + str;
}
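As a quick sanity check (my own example, assuming the toUtf8 function above is used unmodified), a string mixing ASCII and multi-byte characters yields the expected byte values:
// 'a' -> 0x61, 'é' (U+00E9) -> C3 A9, '€' (U+20AC) -> E2 82 AC
console.log(toUtf8("aé€")); // [97, 195, 169, 226, 130, 172]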
I was using Joni's solution and it worked fine, but this one is much shorter.
This was inspired by the atobUTF16() function of Solution #3 of Mozilla's Base64 Unicode discussion
function convertStringToUTF8ByteArray(str) {
    // Copies each UTF-16 code unit's low byte, so this only yields valid UTF-8
    // when every code point is <= 0x7F (or when str is already a "binary string",
    // e.g. the output of atob())
    let binaryArray = new Uint8Array(str.length)
    Array.prototype.forEach.call(binaryArray, function (el, idx, arr) { arr[idx] = str.charCodeAt(idx) })
    return binaryArray
}
Assuming the question is about a DOMString as input and the goal is to get an Array that, when interpreted as a string (e.g. written to a file on disk), would be UTF-8 encoded:
Now that nearly all modern browsers support typed arrays, it'd be a shame if this approach were not listed: the .readAsArrayBuffer() function of a FileReader.
Example:
// Create a Blob with an Euro-char (U+20AC)
var b = new Blob(['€']);
var fr = new FileReader();
fr.onload = function() {
    var ua = new Uint8Array(fr.result);
    // This will log "3|226|130|172"
    // E2 82 AC
    // In UTF-16, it would be only 2 bytes long
    console.log(
        fr.result.byteLength + '|' +
        ua[0] + '|' +
        ua[1] + '|' +
        ua[2] + ''
    );
};
fr.readAsArrayBuffer(b);
Play with that on JSFiddle. I haven't benchmarked this yet but I can imagine this being efficient for large DOMStrings as input.
You can save a string raw, as-is, by using FileReader.
Save the string in a Blob and call readAsArrayBuffer(). The onload event then delivers an ArrayBuffer, which can be converted into a Uint8Array. Unfortunately this call is asynchronous.
This little function will help you:
function stringToBytes(str)
{
    let reader = new FileReader();
    let done = () => {};
    reader.onload = event =>
    {
        done(new Uint8Array(event.target.result), str);
    };
    reader.readAsArrayBuffer(new Blob([str], { type: "application/octet-stream" }));
    return { done: callback => { done = callback; } };
}
Call it like this:
stringToBytes("\u{1f4a9}").done(bytes =>
{
    console.log(bytes);
});
output: [240, 159, 146, 169]
Explanation:
JavaScript uses UTF-16 and surrogate pairs to store Unicode characters in memory. To save Unicode characters in a raw binary byte stream, an encoding is necessary. Usually, UTF-8 is used for this. Without an encoding you can't save Unicode characters, just ASCII up to 0x7F.
FileReader.readAsArrayBuffer() uses UTF-8 (the Blob constructor encodes string parts as UTF-8).
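To illustrate the surrogate-pair point (my own snippet, not part of the answer), a single astral character occupies two UTF-16 code units in a JavaScript string but four bytes in UTF-8:
const pile = "\u{1f4a9}"; // U+1F4A9, one code point
console.log(pile.length); // 2 (two UTF-16 code units: 0xD83D, 0xDCA9)
console.log(pile.codePointAt(0)); // 128169 (0x1F4A9)
// encoded as UTF-8 it becomes the four bytes 240, 159, 146, 169 (F0 9F 92 A9)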