[javascript] Using Javascript's atob to decode base64 doesn't properly decode utf-8 strings

I would assume that one might want a solution that produces a widely useable base64 URI. Please visit data:text/plain;charset=utf-8;base64,4pi44pi54pi64pi74pi84pi+4pi/ to see a demonstration (copy the data uri, open a new tab, paste the data URI into the address bar, then press enter to go to the page). Despite the fact that this URI is base64-encoded, the browser is still able to recognize the high code points and decode them properly. The minified encoder+decoder is 1058 bytes (+Gzip?589 bytes)

!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)

Below is the source code used to generate it.

var fromCharCode = String.fromCharCode;
var btoaUTF8 = (function(btoa, replacer){"use strict";
    return function(inputString, BOMit){
        return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
            /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
        ));
    }
})(btoa, function(nonAsciiChars){"use strict";
    // make the UTF string into a binary UTF-8 encoded string
    var point = nonAsciiChars.charCodeAt(0);
    if (point >= 0xD800 && point <= 0xDBFF) {
        var nextcode = nonAsciiChars.charCodeAt(1);
        if (nextcode !== nextcode) // NaN because string is 1 code point long
            return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
        // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
        if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
            point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
            if (point > 0xffff)
                return fromCharCode(
                    (0x1e/*0b11110*/<<3) | (point>>>18),
                    (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
                    (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
                    (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
                );
        } else return fromCharCode(0xef, 0xbf, 0xbd);
    }
    if (point <= 0x007f) return nonAsciiChars;
    else if (point <= 0x07ff) {
        return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
    } else return fromCharCode(
        (0xe/*0b1110*/<<4) | (point>>>12),
        (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
        (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
    );
});

Then, to decode the base64 data, either HTTP get the data as a data URI or use the function below.

var clz32 = Math.clz32 || (function(log, LN2){"use strict";
    return function(x) {return 31 - log(x >>> 0) / LN2 | 0};
})(Math.log, Math.LN2);
var fromCharCode = String.fromCharCode;
var atobUTF8 = (function(atob, replacer){"use strict";
    return function(inputString, keepBOM){
        inputString = atob(inputString);
        if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
            inputString = inputString.substring(3); // eradicate UTF-8 BOM
        // 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
        // 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
        return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
    }
})(atob, function(encoded){"use strict";
    var codePoint = encoded.charCodeAt(0) << 24;
    var leadingOnes = clz32(~codePoint);
    var endPos = 0, stringLen = encoded.length;
    var result = "";
    if (leadingOnes < 5 && stringLen >= leadingOnes) {
        codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
        for (endPos = 1; endPos < leadingOnes; ++endPos)
            codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
        if (codePoint <= 0xFFFF) { // BMP code point
          result += fromCharCode(codePoint);
        } else if (codePoint <= 0x10FFFF) {
          // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
          codePoint -= 0x10000;
          result += fromCharCode(
            (codePoint >> 10) + 0xD800,  // highSurrogate
            (codePoint & 0x3ff) + 0xDC00 // lowSurrogate
          );
        } else endPos = 0; // to fill it in with INVALIDs
    }
    for (; endPos < stringLen; ++endPos) result += "\ufffd"; // replacement character
    return result;
});

The advantage of being more standard is that this encoder and this decoder are more widely applicable because they can be used as a valid URL that displays correctly. Observe.

_x000D_
_x000D_
(function(window){_x000D_
    "use strict";_x000D_
    var sourceEle = document.getElementById("source");_x000D_
    var urlBarEle = document.getElementById("urlBar");_x000D_
    var mainFrameEle = document.getElementById("mainframe");_x000D_
    var gotoButton = document.getElementById("gotoButton");_x000D_
    var parseInt = window.parseInt;_x000D_
    var fromCodePoint = String.fromCodePoint;_x000D_
    var parse = JSON.parse;_x000D_
    _x000D_
    function unescape(str){_x000D_
        return str.replace(/\\u[\da-f]{0,4}|\\x[\da-f]{0,2}|\\u{[^}]*}|\\[bfnrtv"'\\]|\\0[0-7]{1,3}|\\\d{1,3}/g, function(match){_x000D_
          try{_x000D_
            if (match.startsWith("\\u{"))_x000D_
              return fromCodePoint(parseInt(match.slice(2,-1),16));_x000D_
            if (match.startsWith("\\u") || match.startsWith("\\x"))_x000D_
              return fromCodePoint(parseInt(match.substring(2),16));_x000D_
            if (match.startsWith("\\0") && match.length > 2)_x000D_
              return fromCodePoint(parseInt(match.substring(2),8));_x000D_
            if (/^\\\d/.test(match)) return fromCodePoint(+match.slice(1));_x000D_
          }catch(e){return "\ufffd".repeat(match.length)}_x000D_
          return parse('"' + match + '"');_x000D_
        });_x000D_
    }_x000D_
    _x000D_
    function whenChange(){_x000D_
      try{ urlBarEle.value = "data:text/plain;charset=UTF-8;base64," + btoaUTF8(unescape(sourceEle.value), true);_x000D_
      } finally{ gotoURL(); }_x000D_
    }_x000D_
    sourceEle.addEventListener("change",whenChange,{passive:1});_x000D_
    sourceEle.addEventListener("input",whenChange,{passive:1});_x000D_
    _x000D_
    // IFrame Setup:_x000D_
    function gotoURL(){mainFrameEle.src = urlBarEle.value}_x000D_
    gotoButton.addEventListener("click", gotoURL, {passive: 1});_x000D_
    function urlChanged(){urlBarEle.value = mainFrameEle.src}_x000D_
    mainFrameEle.addEventListener("load", urlChanged, {passive: 1});_x000D_
    urlBarEle.addEventListener("keypress", function(evt){_x000D_
      if (evt.key === "enter") evt.preventDefault(), urlChanged();_x000D_
    }, {passive: 1});_x000D_
    _x000D_
        _x000D_
    var fromCharCode = String.fromCharCode;_x000D_
    var btoaUTF8 = (function(btoa, replacer){_x000D_
      "use strict";_x000D_
        return function(inputString, BOMit){_x000D_
         return btoa((BOMit?"\xEF\xBB\xBF":"") + inputString.replace(_x000D_
          /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer_x000D_
      ));_x000D_
     }_x000D_
    })(btoa, function(nonAsciiChars){_x000D_
  "use strict";_x000D_
     // make the UTF string into a binary UTF-8 encoded string_x000D_
     var point = nonAsciiChars.charCodeAt(0);_x000D_
     if (point >= 0xD800 && point <= 0xDBFF) {_x000D_
      var nextcode = nonAsciiChars.charCodeAt(1);_x000D_
      if (nextcode !== nextcode) { // NaN because string is 1code point long_x000D_
       return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);_x000D_
      }_x000D_
      // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae_x000D_
      if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {_x000D_
       point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;_x000D_
       if (point > 0xffff) {_x000D_
        return fromCharCode(_x000D_
         (0x1e/*0b11110*/<<3) | (point>>>18),_x000D_
         (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),_x000D_
         (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),_x000D_
         (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)_x000D_
        );_x000D_
       }_x000D_
      } else {_x000D_
       return fromCharCode(0xef, 0xbf, 0xbd);_x000D_
      }_x000D_
     }_x000D_
     if (point <= 0x007f) { return inputString; }_x000D_
     else if (point <= 0x07ff) {_x000D_
      return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f/*00111111*/));_x000D_
     } else {_x000D_
      return fromCharCode(_x000D_
       (0xe/*0b1110*/<<4) | (point>>>12),_x000D_
       (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),_x000D_
       (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)_x000D_
      );_x000D_
     }_x000D_
    });_x000D_
    setTimeout(whenChange, 0);_x000D_
})(window);
_x000D_
img:active{opacity:0.8}
_x000D_
<center>_x000D_
<textarea id="source" style="width:66.7vw">Hello \u1234 W\186\0256ld!_x000D_
Enter text into the top box. Then the URL will update automatically._x000D_
</textarea><br />_x000D_
<div style="width:66.7vw;display:inline-block;height:calc(25vw + 1em + 6px);border:2px solid;text-align:left;line-height:1em">_x000D_
<input id="urlBar" style="width:calc(100% - 1em - 13px)" /><img id="gotoButton" src="" style="width:calc(1em + 4px);line-height:1em;vertical-align:-40%;cursor:pointer" />_x000D_
<iframe id="mainframe" style="width:66.7vw;height:25vw" frameBorder="0"></iframe>_x000D_
</div>_x000D_
</center>
_x000D_
_x000D_
_x000D_

In addition to being very standardized, the above code snippets are also very fast. Instead of an indirect chain of succession where the data has to be converted several times between various forms (such as in Riccardo Galli's response), the above code snippet is as direct as performantly possible. It uses only one simple fast String.prototype.replace call to process the data when encoding, and only one to decode the data when decoding. Another plus is that (especially for big strings), String.prototype.replace allows the browser to automatically handle the underlying memory management of resizing the string, leading a significant performance boost especially in evergreen browsers like Chrome and Firefox that heavily optimize String.prototype.replace. Finally, the icing on the cake is that for you latin script exclusivo users, strings which don't contain any code points above 0x7f are extra fast to process because the string remains unmodified by the replacement algorithm.

I have created a github repository for this solution at https://github.com/anonyco/BestBase64EncoderDecoder/

Examples related to javascript

need to add a class to an element How to make a variable accessible outside a function? Hide Signs that Meteor.js was Used How to create a showdown.js markdown extension Please help me convert this script to a simple image slider Highlight Anchor Links when user manually scrolls? Summing radio input values How to execute an action before close metro app WinJS javascript, for loop defines a dynamic variable name Getting all files in directory with ajax

Examples related to encoding

How to check encoding of a CSV file UnicodeEncodeError: 'ascii' codec can't encode character at special name Using Javascript's atob to decode base64 doesn't properly decode utf-8 strings What is the difference between utf8mb4 and utf8 charsets in MySQL? The character encoding of the plain text document was not declared - mootool script UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 23: ordinal not in range(128) How to encode text to base64 in python UTF-8 output from PowerShell Set Encoding of File to UTF8 With BOM in Sublime Text 3 Replace non-ASCII characters with a single space

Examples related to utf-8

error UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte Changing PowerShell's default output encoding to UTF-8 'Malformed UTF-8 characters, possibly incorrectly encoded' in Laravel Encoding Error in Panda read_csv Using Javascript's atob to decode base64 doesn't properly decode utf-8 strings What is the difference between utf8mb4 and utf8 charsets in MySQL? what is <meta charset="utf-8">? Pandas df.to_csv("file.csv" encode="utf-8") still gives trash characters for minus sign UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 23: ordinal not in range(128) Android Studio : unmappable character for encoding UTF-8