I'm using the Javascript window.atob()
function to decode a base64-encoded string (specifically the base64-encoded content from the GitHub API). Problem is I'm getting ASCII-encoded characters back (like â¢
instead of ™
). How can I properly handle the incoming base64-encoded stream so that it's decoded as utf-8?
This question is related to
javascript
encoding
utf-8
I would assume that one might want a solution that produces a widely useable base64 URI. Please visit data:text/plain;charset=utf-8;base64,4pi44pi54pi64pi74pi84pi+4pi/
to see a demonstration (copy the data uri, open a new tab, paste the data URI into the address bar, then press enter to go to the page). Despite the fact that this URI is base64-encoded, the browser is still able to recognize the high code points and decode them properly. The minified encoder+decoder is 1058 bytes (+Gzip?589 bytes)
!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)
Below is the source code used to generate it.
var fromCharCode = String.fromCharCode;
var btoaUTF8 = (function(btoa, replacer){"use strict";
return function(inputString, BOMit){
return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
));
}
})(btoa, function(nonAsciiChars){"use strict";
// make the UTF string into a binary UTF-8 encoded string
var point = nonAsciiChars.charCodeAt(0);
if (point >= 0xD800 && point <= 0xDBFF) {
var nextcode = nonAsciiChars.charCodeAt(1);
if (nextcode !== nextcode) // NaN because string is 1 code point long
return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
if (point > 0xffff)
return fromCharCode(
(0x1e/*0b11110*/<<3) | (point>>>18),
(0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
} else return fromCharCode(0xef, 0xbf, 0xbd);
}
if (point <= 0x007f) return nonAsciiChars;
else if (point <= 0x07ff) {
return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
} else return fromCharCode(
(0xe/*0b1110*/<<4) | (point>>>12),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
});
Then, to decode the base64 data, either HTTP get the data as a data URI or use the function below.
var clz32 = Math.clz32 || (function(log, LN2){"use strict";
return function(x) {return 31 - log(x >>> 0) / LN2 | 0};
})(Math.log, Math.LN2);
var fromCharCode = String.fromCharCode;
var atobUTF8 = (function(atob, replacer){"use strict";
return function(inputString, keepBOM){
inputString = atob(inputString);
if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
inputString = inputString.substring(3); // eradicate UTF-8 BOM
// 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
// 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
}
})(atob, function(encoded){"use strict";
var codePoint = encoded.charCodeAt(0) << 24;
var leadingOnes = clz32(~codePoint);
var endPos = 0, stringLen = encoded.length;
var result = "";
if (leadingOnes < 5 && stringLen >= leadingOnes) {
codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
for (endPos = 1; endPos < leadingOnes; ++endPos)
codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
if (codePoint <= 0xFFFF) { // BMP code point
result += fromCharCode(codePoint);
} else if (codePoint <= 0x10FFFF) {
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
codePoint -= 0x10000;
result += fromCharCode(
(codePoint >> 10) + 0xD800, // highSurrogate
(codePoint & 0x3ff) + 0xDC00 // lowSurrogate
);
} else endPos = 0; // to fill it in with INVALIDs
}
for (; endPos < stringLen; ++endPos) result += "\ufffd"; // replacement character
return result;
});
The advantage of being more standard is that this encoder and this decoder are more widely applicable because they can be used as a valid URL that displays correctly. Observe.
(function(window){_x000D_
"use strict";_x000D_
var sourceEle = document.getElementById("source");_x000D_
var urlBarEle = document.getElementById("urlBar");_x000D_
var mainFrameEle = document.getElementById("mainframe");_x000D_
var gotoButton = document.getElementById("gotoButton");_x000D_
var parseInt = window.parseInt;_x000D_
var fromCodePoint = String.fromCodePoint;_x000D_
var parse = JSON.parse;_x000D_
_x000D_
function unescape(str){_x000D_
return str.replace(/\\u[\da-f]{0,4}|\\x[\da-f]{0,2}|\\u{[^}]*}|\\[bfnrtv"'\\]|\\0[0-7]{1,3}|\\\d{1,3}/g, function(match){_x000D_
try{_x000D_
if (match.startsWith("\\u{"))_x000D_
return fromCodePoint(parseInt(match.slice(2,-1),16));_x000D_
if (match.startsWith("\\u") || match.startsWith("\\x"))_x000D_
return fromCodePoint(parseInt(match.substring(2),16));_x000D_
if (match.startsWith("\\0") && match.length > 2)_x000D_
return fromCodePoint(parseInt(match.substring(2),8));_x000D_
if (/^\\\d/.test(match)) return fromCodePoint(+match.slice(1));_x000D_
}catch(e){return "\ufffd".repeat(match.length)}_x000D_
return parse('"' + match + '"');_x000D_
});_x000D_
}_x000D_
_x000D_
function whenChange(){_x000D_
try{ urlBarEle.value = "data:text/plain;charset=UTF-8;base64," + btoaUTF8(unescape(sourceEle.value), true);_x000D_
} finally{ gotoURL(); }_x000D_
}_x000D_
sourceEle.addEventListener("change",whenChange,{passive:1});_x000D_
sourceEle.addEventListener("input",whenChange,{passive:1});_x000D_
_x000D_
// IFrame Setup:_x000D_
function gotoURL(){mainFrameEle.src = urlBarEle.value}_x000D_
gotoButton.addEventListener("click", gotoURL, {passive: 1});_x000D_
function urlChanged(){urlBarEle.value = mainFrameEle.src}_x000D_
mainFrameEle.addEventListener("load", urlChanged, {passive: 1});_x000D_
urlBarEle.addEventListener("keypress", function(evt){_x000D_
if (evt.key === "enter") evt.preventDefault(), urlChanged();_x000D_
}, {passive: 1});_x000D_
_x000D_
_x000D_
var fromCharCode = String.fromCharCode;_x000D_
var btoaUTF8 = (function(btoa, replacer){_x000D_
"use strict";_x000D_
return function(inputString, BOMit){_x000D_
return btoa((BOMit?"\xEF\xBB\xBF":"") + inputString.replace(_x000D_
/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer_x000D_
));_x000D_
}_x000D_
})(btoa, function(nonAsciiChars){_x000D_
"use strict";_x000D_
// make the UTF string into a binary UTF-8 encoded string_x000D_
var point = nonAsciiChars.charCodeAt(0);_x000D_
if (point >= 0xD800 && point <= 0xDBFF) {_x000D_
var nextcode = nonAsciiChars.charCodeAt(1);_x000D_
if (nextcode !== nextcode) { // NaN because string is 1code point long_x000D_
return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);_x000D_
}_x000D_
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae_x000D_
if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {_x000D_
point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;_x000D_
if (point > 0xffff) {_x000D_
return fromCharCode(_x000D_
(0x1e/*0b11110*/<<3) | (point>>>18),_x000D_
(0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),_x000D_
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),_x000D_
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)_x000D_
);_x000D_
}_x000D_
} else {_x000D_
return fromCharCode(0xef, 0xbf, 0xbd);_x000D_
}_x000D_
}_x000D_
if (point <= 0x007f) { return inputString; }_x000D_
else if (point <= 0x07ff) {_x000D_
return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f/*00111111*/));_x000D_
} else {_x000D_
return fromCharCode(_x000D_
(0xe/*0b1110*/<<4) | (point>>>12),_x000D_
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),_x000D_
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)_x000D_
);_x000D_
}_x000D_
});_x000D_
setTimeout(whenChange, 0);_x000D_
})(window);
_x000D_
img:active{opacity:0.8}
_x000D_
<center>_x000D_
<textarea id="source" style="width:66.7vw">Hello \u1234 W\186\0256ld!_x000D_
Enter text into the top box. Then the URL will update automatically._x000D_
</textarea><br />_x000D_
<div style="width:66.7vw;display:inline-block;height:calc(25vw + 1em + 6px);border:2px solid;text-align:left;line-height:1em">_x000D_
<input id="urlBar" style="width:calc(100% - 1em - 13px)" /><img id="gotoButton" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABsAAAAeCAMAAADqx5XUAAAAclBMVEX///9NczZ8e32ko6fDxsU/fBoSQgdFtwA5pAHVxt+7vLzq5ex23y4SXABLiiTm0+/c2N6DhoQ6WSxSyweVlZVvdG/Uz9aF5kYlbwElkwAggACxs7Jl3hX07/cQbQCar5SU9lRntEWGum+C9zIDHwCGnH5IvZAOAAABmUlEQVQoz7WS25acIBBFkRLkIgKKtOCttbv//xdDmTGZzHv2S63ltuBQQP4rdRiRUP8UK4wh6nVddQwj/NtDQTvac8577zTQb72zj65/876qqt7wykU6/1U6vFEgjE1mt/5LRqrpu7oVsn0sjZejMfxR3W/yLikqAFcUx93YxLmZGOtElmEu6Ufd9xV3ZDTGcEvGLbMk0mHHlUSvS5svCwS+hVL8loQQyfpI1Ay8RF/xlNxcsTchGjGDIuBG3Ik7TMyNxn8m0TSnBAK6Z8UZfp3IbAonmJvmsEACum6aNv7B0CnvpezDcNhw9XWsuAr7qnRg6dABmeM4dTgn/DZdXWs3LMspZ1KDMt1kcPJ6S1icWNp2qaEmjq6myx7jbQK3VKItLJaW5FR+cuYlRhYNKzGa9vF4vM5roLW3OSVjkmiGJrPhUq301/16pVKZRGFYWjTP50spTxBN5Z4EKnSonruk+n4tUokv1aJSEl/MLZU90S3L6/U6o0J142iQVp3HcZxKSo8LfkNRCtJaKYFSRX7iaoAAUDty8wvWYR6HJEepdwAAAABJRU5ErkJggg==" style="width:calc(1em + 4px);line-height:1em;vertical-align:-40%;cursor:pointer" />_x000D_
<iframe id="mainframe" style="width:66.7vw;height:25vw" frameBorder="0"></iframe>_x000D_
</div>_x000D_
</center>
_x000D_
In addition to being very standardized, the above code snippets are also very fast. Instead of an indirect chain of succession where the data has to be converted several times between various forms (such as in Riccardo Galli's response), the above code snippet is as direct as performantly possible. It uses only one simple fast String.prototype.replace
call to process the data when encoding, and only one to decode the data when decoding. Another plus is that (especially for big strings), String.prototype.replace
allows the browser to automatically handle the underlying memory management of resizing the string, leading a significant performance boost especially in evergreen browsers like Chrome and Firefox that heavily optimize String.prototype.replace
. Finally, the icing on the cake is that for you latin script exclusivo users, strings which don't contain any code points above 0x7f are extra fast to process because the string remains unmodified by the replacement algorithm.
I have created a github repository for this solution at https://github.com/anonyco/BestBase64EncoderDecoder/
Decoding base64 to UTF8 String
Below is current most voted answer by @brandonscript
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
Above code can work, but it's very slow. If your input is a very large base64 string, for example 30,000 chars for a base64 html document. It will need lots of computation.
Here is my answer, use built-in TextDecoder, nearly 10x faster than above code for large input.
function decodeBase64(base64) {
const text = atob(base64);
const length = text.length;
const bytes = new Uint8Array(length);
for (let i = 0; i < length; i++) {
bytes[i] = text.charCodeAt(i);
}
const decoder = new TextDecoder(); // default is utf-8
return decoder.decode(bytes);
}
The complete article that works for me: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding
The part where we encode from Unicode/UTF-8 is
function utf8_to_b64( str ) {
return window.btoa(unescape(encodeURIComponent( str )));
}
function b64_to_utf8( str ) {
return decodeURIComponent(escape(window.atob( str )));
}
// Usage:
utf8_to_b64('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64_to_utf8('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
This is one of the most used methods nowadays.
Things change. The escape/unescape methods have been deprecated.
You can URI encode the string before you Base64-encode it. Note that this does't produce Base64-encoded UTF8, but rather Base64-encoded URL-encoded data. Both sides must agree on the same encoding.
See working example here: http://codepen.io/anon/pen/PZgbPW
// encode string
var base64 = window.btoa(encodeURIComponent('€ ?? æøåÆØÅ'));
// decode string
var str = decodeURIComponent(window.atob(tmp));
// str is now === '€ ?? æøåÆØÅ'
For OP's problem a third party library such as js-base64 should solve the problem.
If treating strings as bytes is more your thing, you can use the following functions
function u_atob(ascii) {
return Uint8Array.from(atob(ascii), c => c.charCodeAt(0));
}
function u_btoa(buffer) {
var binary = [];
var bytes = new Uint8Array(buffer);
for (var i = 0, il = bytes.byteLength; i < il; i++) {
binary.push(String.fromCharCode(bytes[i]));
}
return btoa(binary.join(''));
}
// example, it works also with astral plane characters such as ''
var encodedString = new TextEncoder().encode('?');
var base64String = u_btoa(encodedString);
console.log('?' === new TextDecoder().decode(u_atob(base64String)))
Small correction, unescape and escape are deprecated, so:
function utf8_to_b64( str ) {
return window.btoa(decodeURIComponent(encodeURIComponent(str)));
}
function b64_to_utf8( str ) {
return decodeURIComponent(encodeURIComponent(window.atob(str)));
}
function b64_to_utf8( str ) {
str = str.replace(/\s/g, '');
return decodeURIComponent(encodeURIComponent(window.atob(str)));
}
including above solution if still facing issue try as below, Considerign the case where escape is not supported for TS.
blob = new Blob(["\ufeff", csv_content]); // this will make symbols to appears in excel
for csv_content you can try like below.
function b64DecodeUnicode(str: any) {
return decodeURIComponent(atob(str).split('').map((c: any) => {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
Here is 2018 updated solution as described in the Mozilla Development Resources
TO ENCODE FROM UNICODE TO B64
function b64EncodeUnicode(str) {
// first we use encodeURIComponent to get percent-encoded UTF-8,
// then we convert the percent encodings into raw bytes which
// can be fed into btoa.
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
function toSolidBytes(match, p1) {
return String.fromCharCode('0x' + p1);
}));
}
b64EncodeUnicode('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="
TO DECODE FROM B64 TO UNICODE
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
b64DecodeUnicode('Cg=='); // "\n"
Here's some future-proof code for browsers that may lack escape/unescape()
. Note that IE 9 and older don't support atob/btoa()
, so you'd need to use custom base64 functions for them.
// Polyfill for escape/unescape
if( !window.unescape ){
window.unescape = function( s ){
return s.replace( /%([0-9A-F]{2})/g, function( m, p ) {
return String.fromCharCode( '0x' + p );
} );
};
}
if( !window.escape ){
window.escape = function( s ){
var chr, hex, i = 0, l = s.length, out = '';
for( ; i < l; i ++ ){
chr = s.charAt( i );
if( chr.search( /[A-Za-z0-9\@\*\_\+\-\.\/]/ ) > -1 ){
out += chr; continue; }
hex = s.charCodeAt( i ).toString( 16 );
out += '%' + ( hex.length % 2 != 0 ? '0' : '' ) + hex;
}
return out;
};
}
// Base64 encoding of UTF-8 strings
var utf8ToB64 = function( s ){
return btoa( unescape( encodeURIComponent( s ) ) );
};
var b64ToUtf8 = function( s ){
return decodeURIComponent( escape( atob( s ) ) );
};
A more comprehensive example for UTF-8 encoding and decoding can be found here: http://jsfiddle.net/47zwb41o/
Source: Stackoverflow.com