[javascript] Encode html entities in javascript

I am working in a CMS which allows users to enter content. The problem is that when they add symbols ® , it may not display well in all browsers. I would like to set up a list of symbols that must be searched for, and then converted to the corresponding html entity. For example

® => ®
& => &
© => ©
™ => ™

After the conversion, it needs to be wrapped in a <sup> tag, resulting in this:

® => <sup>&reg;</sup>

Because a particular font size and padding style is necessary:

sup { font-size: 0.6em; padding-top: 0.2em; }

Would the JavaScript be something like this?

var regs = document.querySelectorAll('®');
  for ( var i = 0, l = imgs.length; i < l; ++i ) {
  var [?] = regs[i];
  var [?] = document.createElement('sup');
  img.parentNode.insertBefore([?]);
  div.appendChild([?]);
}

Where "[?]" means that there is something that I am not sure about.

Additional Details:

  • I would like to do this with pure JavaScript, not something that requires a library like jQuery, thanks.
  • Backend is Ruby
  • Using RefineryCMS which is built with Ruby on Rails

This question is related to javascript html

The answer is


replaceHtmlEntities(text) {
  var tagsToReplace = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
  };
  var newtext = text;
  for (var tag in tagsToReplace) {
    if (Reflect.apply({}.hasOwnProperty, this, [tagsToReplace, tag])) {
      var regex = new RegExp(tag, 'g');
      newtext = newtext.replace(regex, tagsToReplace[tag]);
    }
  }
  return newtext;
}

Checkout the tutorial from Ourcodeworld Ourcodeworld - encode and decode html entities with javascript

Most importantly, the he library example

he.encode('foo © bar ? baz ???? qux');
// ? 'foo &#xA9; bar &#x2260; baz &#x1D306; qux'

// Passing an `options` object to `encode`, to explicitly encode all symbols:
he.encode('foo © bar ? baz ???? qux', {
 'encodeEverything': true
});

he.decode('foo &copy; bar &ne; baz &#x1D306; qux');
// ? 'foo © bar ? baz ???? qux'

This library would probably make your coding easier and better managed. It is popular, regularly updated and follows the HTML spec. It itself has no dependencies, as can be seen in the package.json


HTML Special Characters & its ESCAPE CODES

Reserved Characters must be escaped by HTML: We can use a character escape to represent any Unicode character [Ex: & - U+00026] in HTML, XHTML or XML using only ASCII characters. Numeric character references [Ex: ampersand(&) - &#38;] & Named character references [Ex: &amp;] are types of character escape used in markup.


Predefined Entities

    Original Character     XML entity replacement    XML numeric replacement  
                  <                                    &lt;                                           &#60;                    
                  >                                     &gt;                                         &#62;                    
                  "                                     &quot;                                      &#34;                    
                  &                                   &amp;                                       &#38;                    
                   '                                    &apos;                                      &#39;                    

To display HTML Tags as a normal form in web page we use <pre>, <code> tags or we can escape them. Escaping the string by replacing with any occurrence of the "&" character by the string "&amp;" and any occurrences of the ">" character by the string "&gt;". Ex: stackoverflow post

function escapeCharEntities() {
    var map = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        "\"": "&quot;",
        "'": "&apos;"
    };
    return map;
}

var mapkeys = '', mapvalues = '';
var html = {
    encodeRex : function () {
        return  new RegExp(mapkeys, 'g'); // "[&<>"']"
    }, 
    decodeRex : function () {
        return  new RegExp(mapvalues, 'g'); // "(&amp;|&lt;|&gt;|&quot;|&apos;)"
    },
    encodeMap : JSON.parse( JSON.stringify( escapeCharEntities () ) ), // json = {&: "&amp;", <: "&lt;", >: "&gt;", ": "&quot;", ': "&apos;"}
    decodeMap : JSON.parse( JSON.stringify( swapJsonKeyValues( escapeCharEntities () ) ) ),
    encode : function ( str ) {
        var encodeRexs = html.encodeRex();
        console.log('Encode Rex: ', encodeRexs); // /[&<>"']/gm
        return str.replace(encodeRexs, function(m) { console.log('Encode M: ', m); return html.encodeMap[m]; }); // m = < " > SpecialChars
    },
    decode : function ( str ) {
        var decodeRexs = html.decodeRex();
        console.log('Decode Rex: ', decodeRexs); // /(&amp;|&lt;|&gt;|&quot;|&apos;)/g
        return str.replace(decodeRexs, function(m) { console.log('Decode M: ', m); return html.decodeMap[m]; }); // m = &lt; &quot; &gt;
    }
};

function swapJsonKeyValues ( json ) {
    var count = Object.keys( json ).length;
    var obj = {};
    var keys = '[', val = '(', keysCount = 1;
    for(var key in json) {
        if ( json.hasOwnProperty( key ) ) {
            obj[ json[ key ] ] = key;
            keys += key;
            if( keysCount < count ) {
                val += json[ key ]+'|';
            } else {
                val += json[ key ];
            }
            keysCount++;
        }
    }
    keys += ']';    val  += ')';
    console.log( keys, ' == ', val);
    mapkeys = keys;
    mapvalues = val;
    return obj;
}

console.log('Encode: ', html.encode('<input type="password" name="password" value=""/>') ); 
console.log('Decode: ', html.decode(html.encode('<input type="password" name="password" value=""/>')) );

O/P:
Encode:  &lt;input type=&quot;password&quot; name=&quot;password&quot; value=&quot;&quot;/&gt;
Decode:  <input type="password" name="password" value=""/>

I had the same problem and created 2 functions to create entities and translate them back to normal characters. The following methods translate any string to HTML entities and back on String prototype

/**
 * Convert a string to HTML entities
 */
String.prototype.toHtmlEntities = function() {
    return this.replace(/./gm, function(s) {
        // return "&#" + s.charCodeAt(0) + ";";
        return (s.match(/[a-z0-9\s]+/i)) ? s : "&#" + s.charCodeAt(0) + ";";
    });
};

/**
 * Create string from HTML entities
 */
String.fromHtmlEntities = function(string) {
    return (string+"").replace(/&#\d+;/gm,function(s) {
        return String.fromCharCode(s.match(/\d+/gm)[0]);
    })
};

You can then use it as following:

var str = "Test´†®¥¨©??ø…ˆƒ?÷?™ƒ?æøp£¨ ƒ™en tést".toHtmlEntities();
console.log("Entities:", str);
console.log("String:", String.fromHtmlEntities(str));

Output in console:

Entities: &#68;&#105;&#116;&#32;&#105;&#115;&#32;&#101;&#180;&#8224;&#174;&#165;&#168;&#169;&#729;&#8747;&#248;&#8230;&#710;&#402;&#8710;&#247;&#8721;&#8482;&#402;&#8710;&#230;&#248;&#960;&#163;&#168;&#160;&#402;&#8482;&#101;&#110;&#32;&#116;&#163;&#101;&#233;&#115;&#116;
String: Dit is e´†®¥¨©??ø…ˆƒ?÷?™ƒ?æøp£¨ ƒ™en t£eést 

var htmlEntities = [
            {regex:/&/g,entity:'&amp;'},
            {regex:/>/g,entity:'&gt;'},
            {regex:/</g,entity:'&lt;'},
            {regex:/"/g,entity:'&quot;'},
            {regex:/á/g,entity:'&aacute;'},
            {regex:/é/g,entity:'&eacute;'},
            {regex:/í/g,entity:'&iacute;'},
            {regex:/ó/g,entity:'&oacute;'},
            {regex:/ú/g,entity:'&uacute;'}
        ];

total = <some string value>

for(v in htmlEntities){
    total = total.replace(htmlEntities[v].regex, htmlEntities[v].entity);
}

A array solution


Without any library, if you do not need to support IE < 9, you could create a html element and set its content with Node.textContent:

var str = "<this is not a tag>";
var p = document.createElement("p");
p.textContent = str;
var converted = p.innerHTML;

Here is an example: https://jsfiddle.net/1erdhehv/

Update: This only works for HTML tag entities (&, <, and >).


Sometimes you just want to encode every character... This function replaces "everything but nothing" in regxp.

function encode(e){return e.replace(/[^]/g,function(e){return"&#"+e.charCodeAt(0)+";"})}

_x000D_
_x000D_
function encode(w) {_x000D_
  return w.replace(/[^]/g, function(w) {_x000D_
    return "&#" + w.charCodeAt(0) + ";";_x000D_
  });_x000D_
}_x000D_
_x000D_
test.value=encode(document.body.innerHTML.trim());
_x000D_
<textarea id=test rows=11 cols=55>www.WHAK.com</textarea>
_x000D_
_x000D_
_x000D_


Here is how I implemented the encoding. I took inspiration from the answers given above.

_x000D_
_x000D_
function encodeHTML(str) {
  const code = {
      ' ' : '&nbsp;',
      '¢' : '&cent;',
      '£' : '&pound;',
      '¥' : '&yen;',
      '€' : '&euro;', 
      '©' : '&copy;',
      '®' : '&reg;',
      '<' : '&lt;', 
      '>' : '&gt;',  
      '"' : '&quot;', 
      '&' : '&amp;',
      '\'' : '&apos;'
  };
  return str.replace(/[\u00A0-\u9999<>\&''""]/gm, (i)=>code[i]);
}

// TEST
console.log(encodeHTML("Dolce & Gabbana"));
console.log(encodeHTML("Hamburgers < Pizza < Tacos"));
console.log(encodeHTML("Sixty > twelve"));
console.log(encodeHTML('Stuff in "quotation marks"'));
console.log(encodeHTML("Schindler's List"));
console.log(encodeHTML("<>"));
_x000D_
_x000D_
_x000D_


You can use the charCodeAt() method to check if the specified character has a value higher than 127 and convert it to a numeric character reference using toString(16).


_x000D_
_x000D_
<!DOCTYPE html>_x000D_
<html>_x000D_
<style>_x000D_
button {_x000D_
backround: #ccc;_x000D_
padding: 14px;_x000D_
width: 400px;_x000D_
font-size: 32px;_x000D_
}_x000D_
#demo {_x000D_
font-size: 20px;_x000D_
font-family: Arial;_x000D_
font-weight: bold;_x000D_
}_x000D_
</style>_x000D_
<body>_x000D_
_x000D_
<p>Click the button to decode.</p>_x000D_
_x000D_
<button onclick="entitycode()">Html Code</button>_x000D_
_x000D_
<p id="demo"></p>_x000D_
_x000D_
_x000D_
<script>_x000D_
function entitycode() {_x000D_
  var uri = "quotation  = ark __ &apos; = apostrophe  __ &amp; = ampersand __ &lt; = less-than __ &gt; = greater-than __  non- = reaking space __ &iexcl; = inverted exclamation mark __ &cent; = cent __ &pound; = pound __ &curren; = currency __ &yen; = yen __ &brvbar; = broken vertical bar __ &sect; = section __ &uml; = spacing diaeresis __ &copy; = copyright __ &ordf; = feminine ordinal indicator __ &laquo; = angle quotation mark (left) __ &not; = negation __ &shy; = soft hyphen __ &reg; = registered trademark __ &macr; = spacing macron __ &deg; = degree __ &plusmn; = plus-or-minus  __ &sup2; = superscript 2 __ &sup3; = superscript 3 __ &acute; = spacing acute __ &micro; = micro __ &para; = paragraph __ &middot; = middle dot __ &cedil; = spacing cedilla __ &sup1; = superscript 1 __ &ordm; = masculine ordinal indicator __ &raquo; = angle quotation mark (right) __ &frac14; = fraction 1/4 __ &frac12; = fraction 1/2 __ &frac34; = fraction 3/4 __ &iquest; = inverted question mark __ &times; = multiplication __ &divide; = division __ &Agrave; = capital a, grave accent __ &Aacute; = capital a, acute accent __ &Acirc; = capital a, circumflex accent __ &Atilde; = capital a, tilde __ &Auml; = capital a, umlaut mark __ &Aring; = capital a, ring __ &AElig; = capital ae __ &Ccedil; = capital c, cedilla __ &Egrave; = capital e, grave accent __ &Eacute; = capital e, acute accent __ &Ecirc; = capital e, circumflex accent __ &Euml; = capital e, umlaut mark __ &Igrave; = capital i, grave accent __ &Iacute; = capital i, acute accent __ &Icirc; = capital i, circumflex accent __ &Iuml; = capital i, umlaut mark __ &ETH; = capital eth, Icelandic __ &Ntilde; = capital n, tilde __ &Ograve; = capital o, grave accent __ &Oacute; = capital o, acute accent __ &Ocirc; = capital o, circumflex accent __ &Otilde; = capital o, tilde __ &Ouml; = capital o, umlaut mark __ &Oslash; = capital o, slash __ &Ugrave; = capital u, grave accent __ &Uacute; = capital u, acute accent __ &Ucirc; = capital u, circumflex accent __ &Uuml; = capital u, umlaut mark __ &Yacute; = capital y, acute accent __ &THORN; = capital THORN, Icelandic __ &szlig; = small sharp s, German __ &agrave; = small a, grave accent __ &aacute; = small a, acute accent __ &acirc; = small a, circumflex accent __ &atilde; = small a, tilde __ &auml; = small a, umlaut mark __ &aring; = small a, ring __ &aelig; = small ae __ &ccedil; = small c, cedilla __ &egrave; = small e, grave accent __ &eacute; = small e, acute accent __ &ecirc; = small e, circumflex accent __ &euml; = small e, umlaut mark __ &igrave; = small i, grave accent __ &iacute; = small i, acute accent __ &icirc; = small i, circumflex accent __ &iuml; = small i, umlaut mark __ &eth; = small eth, Icelandic __ &ntilde; = small n, tilde __ &ograve; = small o, grave accent __ &oacute; = small o, acute accent __ &ocirc; = small o, circumflex accent __ &otilde; = small o, tilde __ &ouml; = small o, umlaut mark __ &oslash; = small o, slash __ &ugrave; = small u, grave accent __ &uacute; = small u, acute accent __ &ucirc; = small u, circumflex accent __ &uuml; = small u, umlaut mark __ &yacute; = small y, acute accent __ &thorn; = small thorn, Icelandic __ &yuml; = small y, umlaut mark";_x000D_
  var enc = encodeURI(uri);_x000D_
  var dec = decodeURI(enc);_x000D_
  var res = dec;_x000D_
  document.getElementById("demo").innerHTML = res;_x000D_
}_x000D_
</script>_x000D_
_x000D_
</body>_x000D_
</html>
_x000D_
_x000D_
_x000D_


You can use this.

var escapeChars = {
  '¢' : 'cent',
  '£' : 'pound',
  '¥' : 'yen',
  '€': 'euro',
  '©' :'copy',
  '®' : 'reg',
  '<' : 'lt',
  '>' : 'gt',
  '"' : 'quot',
  '&' : 'amp',
  '\'' : '#39'
};

var regexString = '[';
for(var key in escapeChars) {
  regexString += key;
}
regexString += ']';

var regex = new RegExp( regexString, 'g');

function escapeHTML(str) {
  return str.replace(regex, function(m) {
    return '&' + escapeChars[m] + ';';
  });
};

https://github.com/epeli/underscore.string/blob/master/escapeHTML.js

var htmlEntities = {
    nbsp: ' ',
    cent: '¢',
    pound: '£',
    yen: '¥',
    euro: '€',
    copy: '©',
    reg: '®',
    lt: '<',
    gt: '>',
    quot: '"',
    amp: '&',
    apos: '\''
};

function unescapeHTML(str) {
    return str.replace(/\&([^;]+);/g, function (entity, entityCode) {
        var match;

        if (entityCode in htmlEntities) {
            return htmlEntities[entityCode];
            /*eslint no-cond-assign: 0*/
        } else if (match = entityCode.match(/^#x([\da-fA-F]+)$/)) {
            return String.fromCharCode(parseInt(match[1], 16));
            /*eslint no-cond-assign: 0*/
        } else if (match = entityCode.match(/^#(\d+)$/)) {
            return String.fromCharCode(~~match[1]);
        } else {
            return entity;
        }
    });
};

If you're already using jQuery, try html().

$('<div>').text('<script>alert("gotcha!")</script>').html()
// "&lt;script&gt;alert("gotcha!")&lt;/script&gt;"

An in-memory text node is instantiated, and html() is called on it.

It's ugly, it wastes a bit of memory, and I have no idea if it's as thorough as something like the he library but if you're already using jQuery, maybe this is an option for you.

Taken from blog post Encode HTML entities with jQuery by Felix Geisendörfer.


one of the Easy Way for Encode Or Decode HTML-entities
just Call a Function with one argument...

Decode HTML-entities

function decodeHTMLEntities(text) {
  var textArea = document.createElement('textarea');
  textArea.innerHTML = text;
  return textArea.value;
}

Decode HTML-entities (JQuery)

function decodeHTMLEntities(text) {
  return $("<textarea/>").html(text).text();
}

Encode HTML-entities

function encodeHTMLEntities(text) {
  var textArea = document.createElement('textarea');
  textArea.innerText = text;
  return textArea.innerHTML;
}

Encode HTML-entities (JQuery)

function encodeHTMLEntities(text) {
  return $("<textarea/>").text(text).html();
}

htmlentities() converts HTML Entities

So we build a constant that will contain our html tags we want to convert.

const htmlEntities = [ 
    {regex:'&',entity:'&amp;'},
    {regex:'>',entity:'&gt;'},
    {regex:'<',entity:'&lt;'} 
  ];

We build a function that will convert all corresponding html characters to string : Html ==> String

 function htmlentities (s){
    var reg; 
    for (v in htmlEntities) {
      reg = new RegExp(htmlEntities[v].regex, 'g');
      s = s.replace(reg, htmlEntities[v].entity);
    }
    return s;
  }

To decode, we build a reverse function that will convert all string to their equivalent html . String ==> html

 function  html_entities_decode (s){
    var reg; 
    for (v in htmlEntities) {
      reg = new RegExp(htmlEntities[v].entity, 'g');
      s = s.replace(reg, htmlEntities[v].regex);
    }
    return s;
  
   }

After, We can encode all others special characters (é è ...) with encodeURIComponent()

Use Case

 var s  = '<div> God bless you guy   </div> '
 var h = encodeURIComponent(htmlentities(s));         /** To encode */
 h =  html_entities_decode(decodeURIComponent(h));     /** To decode */

If you want to avoid encode html entities more than once

function encodeHTML(str){
    return str.replace(/([\u00A0-\u9999<>&])(.|$)/g, function(full, char, next) {
      if(char !== '&' || next !== '#'){
        if(/[\u00A0-\u9999<>&]/.test(next))
          next = '&#' + next.charCodeAt(0) + ';';

        return '&#' + char.charCodeAt(0) + ';' + next;
      }

      return full;
    });
}

function decodeHTML(str){
    return str.replace(/&#([0-9]+);/g, function(full, int) {
        return String.fromCharCode(parseInt(int));
    });
}

# Example

var text = "<a>Content &#169; <#>&<&#># </a>";

text = encodeHTML(text);
console.log("Encode 1 times: " + text);

// &#60;a&#62;Content &#169; &#60;#&#62;&#38;&#60;&#38;#&#62;# &#60;/a&#62;

text = encodeHTML(text);
console.log("Encode 2 times: " + text);

// &#60;a&#62;Content &#169; &#60;#&#62;&#38;&#60;&#38;#&#62;# &#60;/a&#62;

text = decodeHTML(text);
console.log("Decoded: " + text);

// <a>Content © <#>&<&#># </a>

The currently accepted answer has several issues. This post explains them, and offers a more robust solution. The solution suggested in that answer previously had:

var encodedStr = rawStr.replace(/[\u00A0-\u9999<>\&]/gim, function(i) {
  return '&#' + i.charCodeAt(0) + ';';
});

The i flag is redundant since no Unicode symbol in the range from U+00A0 to U+9999 has an uppercase/lowercase variant that is outside of that same range.

The m flag is redundant because ^ or $ are not used in the regular expression.

Why the range U+00A0 to U+9999? It seems arbitrary.

Anyway, for a solution that correctly encodes all except safe & printable ASCII symbols in the input (including astral symbols!), and implements all named character references (not just those in HTML4), use the he library (disclaimer: This library is mine). From its README:

he (for “HTML entities”) is a robust HTML entity encoder/decoder written in JavaScript. It supports all standardized named character references as per HTML, handles ambiguous ampersands and other edge cases just like a browser would, has an extensive test suite, and — contrary to many other JavaScript solutions — he handles astral Unicode symbols just fine. An online demo is available.

Also see this relevant Stack Overflow answer.