[javascript] How to convert UTF8 string to byte array?

The .charCodeAt function returns with the unicode code of the caracter. But I would like to get the byte array instead. I know, if the charcode is over 127, then the character is stored in two or more bytes.

var arr=[];
for(var i=0; i<str.length; i++) {
    arr.push(str.charCodeAt(i))
}

This question is related to javascript utf-8

The answer is


function convertByte()
{
    var c=document.getElementById("str").value;
    var arr = [];
    var i=0;
    for(var ind=0;ind<c.length;ind++)
    {
        arr[ind]=c.charCodeAt(i);
        i++;
    }    
    document.getElementById("result").innerHTML="The converted value is "+arr.join("");    
}

JavaScript Strings are stored in UTF-16. To get UTF-8, you'll have to convert the String yourself.

One way is to mix encodeURIComponent(), which will output UTF-8 bytes URL-encoded, with unescape, as mentioned on ecmanaut.

var utf8 = unescape(encodeURIComponent(str));

var arr = [];
for (var i = 0; i < utf8.length; i++) {
    arr.push(utf8.charCodeAt(i));
}

The Google Closure library has functions to convert to/from UTF-8 and byte arrays. If you don't want to use the whole library, you can copy the functions from here. For completeness, the code to convert to a string to a UTF-8 byte array is:

goog.crypt.stringToUtf8ByteArray = function(str) {
  // TODO(user): Use native implementations if/when available
  var out = [], p = 0;
  for (var i = 0; i < str.length; i++) {
    var c = str.charCodeAt(i);
    if (c < 128) {
      out[p++] = c;
    } else if (c < 2048) {
      out[p++] = (c >> 6) | 192;
      out[p++] = (c & 63) | 128;
    } else if (
        ((c & 0xFC00) == 0xD800) && (i + 1) < str.length &&
        ((str.charCodeAt(i + 1) & 0xFC00) == 0xDC00)) {
      // Surrogate Pair
      c = 0x10000 + ((c & 0x03FF) << 10) + (str.charCodeAt(++i) & 0x03FF);
      out[p++] = (c >> 18) | 240;
      out[p++] = ((c >> 12) & 63) | 128;
      out[p++] = ((c >> 6) & 63) | 128;
      out[p++] = (c & 63) | 128;
    } else {
      out[p++] = (c >> 12) | 224;
      out[p++] = ((c >> 6) & 63) | 128;
      out[p++] = (c & 63) | 128;
    }
  }
  return out;
};

The TextEncoder and TextDecoder Encoding API will let you both encode and decode UTF-8 easily (using typed arrays):

const encoded = new TextEncoder().encode("Ge?a s?? ??sµe");
const decoded = new TextDecoder().decode(encoded);

console.log(encoded, decoded);

Browser support isn't too bad, and there's a polyfill that should work in IE11 and older versions of Edge.


Older versions of TextEncoder supported other different encodings as a string constructor parameter argument, but since Firefox 48 and Chrome 53 this is no-longer supported and TextEncoder and TextDecoder support only UTF-8.

So this will no-longer work in modern web-browsers used since 2016:

new TextDecoder("shift-jis").decode(new Uint8Array(textbuffer))

As there is no pure byte type in JavaScript we can represent a byte array as an array of numbers, where each number represents a byte and thus will have an integer value between 0 and 255 inclusive.

Here is a simple function that does convert a JavaScript string into an Array of numbers that contain the UTF-8 encoding of the string:

function toUtf8(str) {
    var value = [];
    var destIndex = 0;
    for (var index = 0; index < str.length; index++) {
        var code = str.charCodeAt(index);
        if (code <= 0x7F) {
            value[destIndex++] = code;
        } else if (code <= 0x7FF) {
            value[destIndex++] = ((code >> 6 ) & 0x1F) | 0xC0;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0xFFFF) {
            value[destIndex++] = ((code >> 12) & 0x0F) | 0xE0;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x1FFFFF) {
            value[destIndex++] = ((code >> 18) & 0x07) | 0xF0;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x03FFFFFF) {
            value[destIndex++] = ((code >> 24) & 0x03) | 0xF0;
            value[destIndex++] = ((code >> 18) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else if (code <= 0x7FFFFFFF) {
            value[destIndex++] = ((code >> 30) & 0x01) | 0xFC;
            value[destIndex++] = ((code >> 24) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 18) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 12) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 6 ) & 0x3F) | 0x80;
            value[destIndex++] = ((code >> 0 ) & 0x3F) | 0x80;
        } else {
            throw new Error("Unsupported Unicode character \"" 
                + str.charAt(index) + "\" with code " + code + " (binary: " 
                + toBinary(code) + ") at index " + index
                + ". Cannot represent it as UTF-8 byte sequence.");
        }
    }
    return value;
}

function toBinary(byteValue) {
    if (byteValue < 0) {
        byteValue = byteValue & 0x00FF;
    }
    var str = byteValue.toString(2);
    var len = str.length;
    var prefix = "";
    for (var i = len; i < 8; i++) {
        prefix += "0";
    }
    return prefix + str;
}

I was using Joni's solution and it worked fine, but this one is much shorter.

This was inspired by the atobUTF16() function of Solution #3 of Mozilla's Base64 Unicode discussion

function convertStringToUTF8ByteArray(str) {
    let binaryArray = new Uint8Array(str.length)
    Array.prototype.forEach.call(binaryArray, function (el, idx, arr) { arr[idx] = str.charCodeAt(idx) })
    return binaryArray
}

Assuming the question is about a DOMString as input and the goal is to get an Array, that when interpreted as string (e.g. written to a file on disk), would be UTF-8 encoded:

Now that nearly all modern browsers support Typed Arrays, it'd be ashamed if this approach is not listed:

  • According to the W3C, software supporting the File API should accept DOMStrings in their Blob constructor (see also: String encoding when constructing a Blob)
  • Blobs can be converted to an ArrayBuffer using the .readAsArrayBuffer() function of a File Reader
  • Using a DataView or constructing a Typed Array with the buffer read by the File Reader, one can access every single byte of the ArrayBuffer

Example:

// Create a Blob with an Euro-char (U+20AC)
var b = new Blob(['€']);
var fr = new FileReader();

fr.onload = function() {
    ua = new Uint8Array(fr.result);
    // This will log "3|226|130|172"
    //                  E2  82  AC
    // In UTF-16, it would be only 2 bytes long
    console.log(
        fr.result.byteLength + '|' + 
        ua[0]  + '|' + 
        ua[1] + '|' + 
        ua[2] + ''
    );
};
fr.readAsArrayBuffer(b);

Play with that on JSFiddle. I haven't benchmarked this yet but I can imagine this being efficient for large DOMStrings as input.


You can save a string raw as is by using FileReader.

Save the string in a blob and call readAsArrayBuffer(). Then the onload-event results an arraybuffer, which can converted in a Uint8Array. Unfortunately this call is asynchronous.

This little function will help you:

function stringToBytes(str)
{
    let reader = new FileReader();
    let done = () => {};

    reader.onload = event =>
    {
        done(new Uint8Array(event.target.result), str);
    };
    reader.readAsArrayBuffer(new Blob([str], { type: "application/octet-stream" }));

    return { done: callback => { done = callback; } };
}

Call it like this:

stringToBytes("\u{1f4a9}").done(bytes =>
{
    console.log(bytes);
});

output: [240, 159, 146, 169]

explanation:

JavaScript use UTF-16 and surrogate-pairs to store unicode characters in memory. To save unicode character in raw binary byte streams an encoding is necessary. Usually and in the most case, UTF-8 is used for this. If you not use an enconding you can't save unicode character, just ASCII up to 0x7f.

FileReader.readAsArrayBuffer() uses UTF-8.