I would assume that one might want a solution that produces a widely usable base64 URI. Please visit data:text/plain;charset=utf-8;base64,4pi44pi54pi64pi74pi84pi+4pi/
to see a demonstration (copy the data URI, open a new tab, paste the data URI into the address bar, then press Enter to go to the page). Even though this URI is base64-encoded, the browser is still able to recognize the high code points and decode them properly. The minified encoder+decoder is about 1058 bytes (+Gzip → about 589 bytes).
// Minified build of the UTF-8-aware Base64 helpers. It installs two functions on the
// global object (`global`, else `self`, else `this`):
//   btoaUTF8(string, addBOM)  -> Base64 of the string's UTF-8 bytes (optionally BOM-prefixed)
//   atobUTF8(base64, keepBOM) -> decoded string; a leading UTF-8 BOM is removed unless keepBOM
// Unpaired surrogates are encoded as U+FFFD, and the BOM check runs on the decoded bytes.
!function(e){"use strict";function h(b){var a=b.charCodeAt(0),c;if(56320<=a&&57343>=a)return d(239,191,189);if(55296<=a&&56319>=a){c=b.charCodeAt(1);if(c!==c)return d(239,191,189);a=1024*(a-55296)+c-56320+65536;return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}return 127>=a?b:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var l=Math.clz32||function(b){b>>>=0;for(var a=32;b;b>>>=1)--a;return a},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){b=p(b);a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return b.replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global);
Below is the source code used to generate it.
var fromCharCode = String.fromCharCode;
/**
 * Base64-encodes a JavaScript string after converting it to UTF-8 bytes.
 *
 * Characters up to U+007F are passed to btoa unchanged; every other code point is
 * replaced by its UTF-8 byte sequence (one "binary string" character per byte).
 * Unpaired surrogates, high or low, are encoded as U+FFFD (EF BF BD) so the output
 * is always well-formed UTF-8.
 *
 * @param {string} inputString - Text to encode.
 * @param {boolean} [BOMit] - When truthy, the UTF-8 byte order mark (EF BB BF) is prepended.
 * @returns {string} The Base64 text returned by btoa.
 */
var btoaUTF8 = (function(btoa, replacer){"use strict";
    return function(inputString, BOMit){
        // The regex only matches code units >= 0x80, so pure-ASCII input is not rewritten.
        return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
            /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
        ));
    };
})(btoa, function(nonAsciiChars){"use strict";
    // Turn one matched character (or surrogate pair) into its UTF-8 byte string.
    var point = nonAsciiChars.charCodeAt(0);
    if (point >= 0xD800 && point <= 0xDFFF) {
        var nextcode = nonAsciiChars.charCodeAt(1); // NaN when the match is one code unit long
        if (point >= 0xDC00 || !(nextcode >= 0xDC00 && nextcode <= 0xDFFF)) {
            // lone low surrogate, or high surrogate without a low one: U+FFFD
            return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
        }
        // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
        point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
        return fromCharCode(
            (0x1e/*0b11110*/<<3) | (point>>>18),
            (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
            (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
            (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
        );
    }
    if (point <= 0x007f) {
        return nonAsciiChars;
    }
    if (point <= 0x07ff) {
        return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
    }
    return fromCharCode(
        (0xe/*0b1110*/<<4) | (point>>>12),
        (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
        (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
    );
});
Then, to decode the base64 data, either load it as a data URI (for example, by fetching the data URI) or use the function below.
/**
 * Fallback for Math.clz32: counts the leading zero bits of x viewed as an unsigned
 * 32-bit integer. Uses integer shifts only, so it is exact for every input and
 * returns 32 for 0.
 *
 * @param {number} x - Value to inspect (converted with >>> 0).
 * @returns {number} Number of leading zero bits, from 0 to 32.
 */
function clz32Fallback(x){"use strict";
    var zeros = 32;
    for (var bits = x >>> 0; bits !== 0; bits >>>= 1) --zeros;
    return zeros;
}
var clz32 = Math.clz32 || clz32Fallback;
var fromCharCode = String.fromCharCode;
/**
 * Decodes Base64 text whose payload is UTF-8 and returns it as a JavaScript string.
 *
 * @param {string} inputString - Base64 text; it is passed to atob unchanged.
 * @param {boolean} [keepBOM] - When falsy, a leading UTF-8 byte order mark (EF BB BF)
 *     in the decoded bytes is removed before decoding.
 * @returns {string} The decoded text. Byte runs that cannot be decoded are replaced by
 *     U+FFFD, one per byte.
 */
var atobUTF8 = (function(atob, replacer){"use strict";
    return function(inputString, keepBOM){
        inputString = atob(inputString);
        if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
            inputString = inputString.substring(3); // eradicate UTF-8 BOM
        // 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
        // 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
        return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
    };
})(atob, function(encoded){"use strict";
    // `encoded` is one lead byte followed by every continuation byte after it.
    var codePoint = encoded.charCodeAt(0) << 24;
    // The number of leading 1 bits in the lead byte is the sequence length in bytes.
    var leadingOnes = clz32(~codePoint);
    var endPos = 0, stringLen = encoded.length;
    var result = "";
    if (leadingOnes < 5 && stringLen >= leadingOnes) {
        // strip the length marker bits, keeping only the lead byte's payload bits
        codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
        for (endPos = 1; endPos < leadingOnes; ++endPos)
            codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
        if (codePoint <= 0xFFFF) { // BMP code point
            result += fromCharCode(codePoint);
        } else if (codePoint <= 0x10FFFF) {
            // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
            codePoint -= 0x10000;
            result += fromCharCode(
                (codePoint >> 10) + 0xD800,  // highSurrogate
                (codePoint & 0x3ff) + 0xDC00 // lowSurrogate
            );
        } else endPos = 0; // to fill it in with INVALIDs
    }
    // every byte that was not consumed above becomes a replacement character
    for (; endPos < stringLen; ++endPos) result += "\ufffd";
    return result;
});
The advantage of being more standard is that this encoder and this decoder are more widely applicable, because the encoded output can be used in a valid data URL that displays correctly. Observe.
// Live demo: reads escaped text from the #source textarea, encodes it with btoaUTF8
// into a base64 data URI, shows that URI in #urlBar and loads it in the #mainframe iframe.
(function(window){
    "use strict";
    var sourceEle = document.getElementById("source");
    var urlBarEle = document.getElementById("urlBar");
    var mainFrameEle = document.getElementById("mainframe");
    var gotoButton = document.getElementById("gotoButton");
    var parseInt = window.parseInt;
    var fromCodePoint = String.fromCodePoint;

    // Single-character escapes, looked up directly (JSON.parse rejects \v and \').
    var SIMPLE_ESCAPES = {
        "\\b": "\b", "\\f": "\f", "\\n": "\n", "\\r": "\r", "\\t": "\t",
        "\\v": "\v", "\\\"": "\"", "\\'": "'", "\\\\": "\\"
    };

    /**
     * Expands backslash escapes typed into the source box.
     * Supported forms: \u{HEX}, \uHHHH, \xHH, the single-character escapes above,
     * \0 followed by octal digits, and \ followed by up to three decimal digits.
     * An escape that does not yield a valid code point is replaced by one U+FFFD
     * per character of the escape.
     *
     * @param {string} str - Raw text containing escapes.
     * @returns {string} Text with the escapes replaced.
     */
    function unescape(str){
        // \u{...} must be tried before \uHHHH, otherwise the bare "\u" prefix wins the match.
        return str.replace(/\\u\{[^}]*\}|\\u[\da-fA-F]{0,4}|\\x[\da-fA-F]{0,2}|\\[bfnrtv"'\\]|\\0[0-7]{1,3}|\\\d{1,3}/g, function(match){
            try {
                if (match.startsWith("\\u{"))
                    return fromCodePoint(parseInt(match.slice(3,-1),16)); // skip the "\u{" prefix
                if (match.startsWith("\\u") || match.startsWith("\\x"))
                    return fromCodePoint(parseInt(match.substring(2),16));
                if (match.startsWith("\\0") && match.length > 2)
                    return fromCodePoint(parseInt(match.substring(2),8));
                if (/^\\\d/.test(match)) return fromCodePoint(+match.slice(1));
            } catch(e) {
                // fromCodePoint throws on NaN or out-of-range values: show replacement characters
                return "\ufffd".repeat(match.length);
            }
            return SIMPLE_ESCAPES[match];
        });
    }

    // Rebuild the data URI from the source box, then navigate the iframe even if encoding threw.
    function whenChange(){
        try {
            urlBarEle.value = "data:text/plain;charset=UTF-8;base64," + btoaUTF8(unescape(sourceEle.value), true);
        } finally {
            gotoURL();
        }
    }
    sourceEle.addEventListener("change", whenChange, {passive: true});
    sourceEle.addEventListener("input", whenChange, {passive: true});

    // IFrame Setup:
    function gotoURL(){ mainFrameEle.src = urlBarEle.value; }
    gotoButton.addEventListener("click", gotoURL, {passive: true});
    function urlChanged(){ urlBarEle.value = mainFrameEle.src; }
    mainFrameEle.addEventListener("load", urlChanged, {passive: true});
    // Not passive: the handler calls preventDefault(). Enter navigates to the typed URL.
    urlBarEle.addEventListener("keydown", function(evt){
        if (evt.key === "Enter") {
            evt.preventDefault();
            gotoURL();
        }
    });

    var fromCharCode = String.fromCharCode;
    /**
     * Base64-encodes a string as UTF-8 bytes; see the stand-alone encoder for details.
     *
     * @param {string} inputString - Text to encode.
     * @param {boolean} [BOMit] - When truthy, the UTF-8 BOM (EF BB BF) is prepended.
     * @returns {string} The Base64 text returned by btoa.
     */
    var btoaUTF8 = (function(btoa, replacer){
        return function(inputString, BOMit){
            return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
                /[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
            ));
        };
    })(btoa, function(nonAsciiChars){
        // make the UTF string into a binary UTF-8 encoded string
        var point = nonAsciiChars.charCodeAt(0);
        if (point >= 0xD800 && point <= 0xDFFF) {
            var nextcode = nonAsciiChars.charCodeAt(1); // NaN when the match is one code unit long
            if (point >= 0xDC00 || !(nextcode >= 0xDC00 && nextcode <= 0xDFFF)) {
                // unpaired surrogate: encode U+FFFD instead
                return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
            }
            // https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
            point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
            return fromCharCode(
                (0x1e/*0b11110*/<<3) | (point>>>18),
                (0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
                (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
                (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
            );
        }
        if (point <= 0x007f) {
            return nonAsciiChars;
        }
        if (point <= 0x07ff) {
            return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f/*00111111*/));
        }
        return fromCharCode(
            (0xe/*0b1110*/<<4) | (point>>>12),
            (0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
            (0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
        );
    });
    setTimeout(whenChange, 0);
})(window);

img:active{opacity:0.8}

<center>
<textarea id="source" style="width:66.7vw">Hello \u1234 W\186\0256ld!
Enter text into the top box. Then the URL will update automatically.
</textarea><br />
<div style="width:66.7vw;display:inline-block;height:calc(25vw + 1em + 6px);border:2px solid;text-align:left;line-height:1em">
<input id="urlBar" style="width:calc(100% - 1em - 13px)" /><img id="gotoButton" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABsAAAAeCAMAAADqx5XUAAAAclBMVEX///9NczZ8e32ko6fDxsU/fBoSQgdFtwA5pAHVxt+7vLzq5ex23y4SXABLiiTm0+/c2N6DhoQ6WSxSyweVlZVvdG/Uz9aF5kYlbwElkwAggACxs7Jl3hX07/cQbQCar5SU9lRntEWGum+C9zIDHwCGnH5IvZAOAAABmUlEQVQoz7WS25acIBBFkRLkIgKKtOCttbv//xdDmTGZzHv2S63ltuBQQP4rdRiRUP8UK4wh6nVddQwj/NtDQTvac8577zTQb72zj65/876qqt7wykU6/1U6vFEgjE1mt/5LRqrpu7oVsn0sjZejMfxR3W/yLikqAFcUx93YxLmZGOtElmEu6Ufd9xV3ZDTGcEvGLbMk0mHHlUSvS5svCwS+hVL8loQQyfpI1Ay8RF/xlNxcsTchGjGDIuBG3Ik7TMyNxn8m0TSnBAK6Z8UZfp3IbAonmJvmsEACum6aNv7B0CnvpezDcNhw9XWsuAr7qnRg6dABmeM4dTgn/DZdXWs3LMspZ1KDMt1kcPJ6S1icWNp2qaEmjq6myx7jbQK3VKItLJaW5FR+cuYlRhYNKzGa9vF4vM5roLW3OSVjkmiGJrPhUq301/16pVKZRGFYWjTP50spTxBN5Z4EKnSonruk+n4tUokv1aJSEl/MLZU90S3L6/U6o0J142iQVp3HcZxKSo8LfkNRCtJaKYFSRX7iaoAAUDty8wvWYR6HJEepdwAAAABJRU5ErkJggg==" style="width:calc(1em + 4px);line-height:1em;vertical-align:-40%;cursor:pointer" />
<iframe id="mainframe" style="width:66.7vw;height:25vw" frameBorder="0"></iframe>
</div>
</center>

In addition to being very standardized, the above code snippets are also very fast. Instead of an indirect chain of succession where the data has to be converted several times between various forms (such as in Riccardo Galli's response), the above code snippet is as direct as it can be while staying performant. It uses only one simple, fast String.prototype.replace
call to process the data when encoding, and only one to decode the data when decoding. Another plus is that (especially for big strings) String.prototype.replace
allows the browser to automatically handle the underlying memory management of resizing the string, leading to a significant performance boost, especially in evergreen browsers like Chrome and Firefox that heavily optimize String.prototype.replace.
Finally, the icing on the cake is that for users who write exclusively in Latin script, strings which don't contain any code points above 0x7f are extra fast to process because the string remains unmodified by the replacement algorithm.
I have created a github repository for this solution at https://github.com/anonyco/BestBase64EncoderDecoder/