For a poor man's implementation of near-collation-correct sorting on the client side I need a JavaScript function that does efficient single character replacement in a string.
Here is what I mean (note that this applies to German text, other languages sort differently):
native sorting gets it wrong: a b c o u z ä ö ü collation-correct would be: a ä b c o ö u ü z
Basically, I need all occurrences of "ä" of a given string replaced with "a" (and so on). This way the result of native sorting would be very close to what a user would expect (or what a database would return).
Other languages have facilities to do just that: Python supplies str.translate(), in Perl there is tr/…/…/, XPath has a function translate(), ColdFusion has ReplaceList(). But what about JavaScript?
Here is what I have right now.
/**
 * Build a sort key for German text by folding umlauts onto their base
 * vowels, so that a plain native sort comes close to collation order.
 *
 * @param {String} s a rather short string (something like 200 characters
 *                   at max, most of the time much less)
 * @return {String} s with every ä/ö/ü/Ä/Ö/Ü replaced by a/o/u/A/O/U
 */
function makeSortString(s) {
    var umlautPattern = /[öäüÖÄÜ]/g;
    var baseLetterFor = {
        "ä": "a", "ö": "o", "ü": "u",
        "Ä": "A", "Ö": "O", "Ü": "U" // probably more to come
    };

    // Called once per matched umlaut; every match has an entry in the table.
    function toBaseLetter(umlaut) {
        return baseLetterFor[umlaut];
    }

    return s.replace(umlautPattern, toBaseLetter);
}
For starters, I don't like the fact that the regex is rebuilt every time I call the function. I guess a closure can help in this regard, but I don't seem to get the hang of it for some reason.
Can someone think of something more efficient?
See also String#localeCompare, which is now widely supported among JS engines (not so much at the time of the question) and could solve this category of problem much more elegantly.
This question is related to
javascript
sorting
string
collation
Not a single answer mentions String.localeCompare, which happens to do exactly what you originally wanted, but not what you're asking for.
var list = ['a', 'b', 'c', 'o', 'u', 'z', 'ä', 'ö', 'ü'];
// Pin the collation to German: without an explicit locale the order depends
// on the runtime's default locale (Swedish, for example, sorts ä/ö after z).
list.sort((a, b) => a.localeCompare(b, 'de'));
console.log(list);
// Outputs ['a', 'ä', 'b', 'c', 'o', 'ö', 'u', 'ü', 'z']
The second and third parameters are not supported by older browsers, though. It's an option worth considering nonetheless.
A direct port to JavaScript of Kieron's solution: https://github.com/rwarasaurus/nano/blob/master/system/helpers.php#L61-73:
/**
 * Normalise a string by replacing accented / foreign Latin characters with
 * their closest ASCII equivalents (Latin-1 Supplement, Latin Extended-A and
 * a handful of Latin Extended-B letters).
 *
 * @param {String} str
 * @return {String} str with every listed character replaced; characters
 *                  that are not listed (including "?") are left untouched
 */
var normalize = (function () {
    // Parallel tables: every occurrence of a[i] is replaced by b[i].
    // The source characters are spelled out in full; a lossy re-encoding of
    // this table would turn them into "?" or plain ASCII letters and make
    // the function rewrite ordinary text.
    var a = [
        'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß',
        'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ',
        'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ',
        'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ',
        'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ',
        'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', 'ŉ',
        'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő', 'Œ', 'œ', 'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř', 'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š',
        'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ', 'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų',
        'Ŵ', 'ŵ', 'Ŷ', 'ŷ', 'Ÿ', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ƒ',
        'Ơ', 'ơ', 'Ư', 'ư', 'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ', 'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ',
        'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ'
    ];
    var b = [
        'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's',
        'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y',
        'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd',
        'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h',
        'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k',
        'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n',
        'O', 'o', 'O', 'o', 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', 'S', 's',
        'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
        'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's', 'f',
        'O', 'o', 'U', 'u', 'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
        'A', 'a', 'AE', 'ae', 'O', 'o'
    ];
    return function (str) {
        var i = a.length;
        // split/join replaces EVERY occurrence; String#replace with a plain
        // string pattern would only replace the first one.
        while (i--) str = str.split(a[i]).join(b[i]);
        return str;
    };
}());
And a slightly modified version, using a char-map instead of two arrays:
To compare these two methods I made a simple benchmark: http://jsperf.com/replace-foreign-characters
/**
 * Normalise a string by replacing accented / foreign Latin characters with
 * their closest ASCII equivalents, using a single-pass character map.
 *
 * @param {String} str
 * @return {String} str with every mapped character replaced; unmapped
 *                  characters (including punctuation such as "?") are kept
 */
var normalize = (function () {
    // Replacement -> all source characters that fold onto it. Grouping keeps
    // the table compact and makes duplicate or stray keys easy to spot.
    var groups = {
        "A": "ÀÁÂÃÄÅĀĂĄǍǺ",
        "AE": "ÆǼ",
        "C": "ÇĆĈĊČ",
        "D": "ÐĎĐ",
        "E": "ÈÉÊËĒĔĖĘĚ",
        "G": "ĜĞĠĢ",
        "H": "ĤĦ",
        "I": "ÌÍÎÏĨĪĬĮİǏ",
        "IJ": "Ĳ",
        "J": "Ĵ",
        "K": "Ķ",
        "L": "ĹĻĽĿŁ",
        "N": "ÑŃŅŇ",
        "O": "ÒÓÔÕÖØŌŎŐƠǑǾ",
        "OE": "Œ",
        "R": "ŔŖŘ",
        "S": "ŚŜŞŠ",
        "T": "ŢŤŦ",
        "U": "ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛ",
        "W": "Ŵ",
        "Y": "ÝŶŸ",
        "Z": "ŹŻŽ",
        "a": "àáâãäåāăąǎǻ",
        "ae": "æǽ",
        "c": "çćĉċč",
        "d": "ďđ",
        "e": "èéêëēĕėęě",
        "f": "ƒ",
        "g": "ĝğġģ",
        "h": "ĥħ",
        "i": "ìíîïĩīĭįıǐ",
        "ij": "ĳ",
        "j": "ĵ",
        "k": "ķ",
        "l": "ĺļľŀł",
        "n": "ñńņňŉ",
        "o": "òóôõöøōŏőơǒǿ",
        "oe": "œ",
        "r": "ŕŗř",
        "s": "ßśŝşšſ",
        "t": "ţťŧ",
        "u": "ùúûüũūŭůűųưǔǖǘǚǜ",
        "w": "ŵ",
        "y": "ýÿŷ",
        "z": "źżž"
    },
    map = {},
    // \W matches every character outside [A-Za-z0-9_], which includes all
    // accented letters; ASCII letters never reach the lookup.
    nonWord = /\W/g,
    mapping = function (c) {
        return map[c] || c;
    };

    // Expand the grouped table into the flat char -> replacement map once.
    Object.keys(groups).forEach(function (replacement) {
        var chars = groups[replacement], i;
        for (i = 0; i < chars.length; i++) {
            map[chars.charAt(i)] = replacement;
        }
    });

    return function (str) {
        return str.replace(nonWord, mapping);
    };
}());
The correct term for such accents is diacritics. After Googling this term, I found this function, which is part of backbone.paginator. It has a very complete collection of diacritics and replaces them with their most intuitive ASCII character. I found this to be the most complete JavaScript solution available today.
The full function for future reference:
/**
 * Replace letters carrying diacritics (and a number of Latin digraphs and
 * letter variants) with their plain ASCII base letters.
 *
 * @param {String} str text to clean
 * @return {String} str after every rule below has been applied in order
 */
function removeDiacritics(str) {
    // Each rule rewrites every character in `letters` to `base`.
    var rules = [
        { base: 'A', letters: /[\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F]/g },
        { base: 'AA', letters: /[\uA732]/g },
        { base: 'AE', letters: /[\u00C6\u01FC\u01E2]/g },
        { base: 'AO', letters: /[\uA734]/g },
        { base: 'AU', letters: /[\uA736]/g },
        { base: 'AV', letters: /[\uA738\uA73A]/g },
        { base: 'AY', letters: /[\uA73C]/g },
        { base: 'B', letters: /[\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181]/g },
        { base: 'C', letters: /[\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E]/g },
        { base: 'D', letters: /[\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779]/g },
        { base: 'DZ', letters: /[\u01F1\u01C4]/g },
        { base: 'Dz', letters: /[\u01F2\u01C5]/g },
        { base: 'E', letters: /[\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E]/g },
        { base: 'F', letters: /[\u0046\u24BB\uFF26\u1E1E\u0191\uA77B]/g },
        { base: 'G', letters: /[\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E]/g },
        { base: 'H', letters: /[\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D]/g },
        { base: 'I', letters: /[\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197]/g },
        { base: 'J', letters: /[\u004A\u24BF\uFF2A\u0134\u0248]/g },
        { base: 'K', letters: /[\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2]/g },
        { base: 'L', letters: /[\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780]/g },
        { base: 'LJ', letters: /[\u01C7]/g },
        { base: 'Lj', letters: /[\u01C8]/g },
        { base: 'M', letters: /[\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C]/g },
        { base: 'N', letters: /[\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4]/g },
        { base: 'NJ', letters: /[\u01CA]/g },
        { base: 'Nj', letters: /[\u01CB]/g },
        { base: 'O', letters: /[\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C]/g },
        { base: 'OI', letters: /[\u01A2]/g },
        { base: 'OO', letters: /[\uA74E]/g },
        { base: 'OU', letters: /[\u0222]/g },
        { base: 'P', letters: /[\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754]/g },
        { base: 'Q', letters: /[\u0051\u24C6\uFF31\uA756\uA758\u024A]/g },
        { base: 'R', letters: /[\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782]/g },
        { base: 'S', letters: /[\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784]/g },
        { base: 'T', letters: /[\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786]/g },
        { base: 'TZ', letters: /[\uA728]/g },
        { base: 'U', letters: /[\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244]/g },
        { base: 'V', letters: /[\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245]/g },
        { base: 'VY', letters: /[\uA760]/g },
        { base: 'W', letters: /[\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72]/g },
        { base: 'X', letters: /[\u0058\u24CD\uFF38\u1E8A\u1E8C]/g },
        { base: 'Y', letters: /[\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE]/g },
        { base: 'Z', letters: /[\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762]/g },
        { base: 'a', letters: /[\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250]/g },
        { base: 'aa', letters: /[\uA733]/g },
        { base: 'ae', letters: /[\u00E6\u01FD\u01E3]/g },
        { base: 'ao', letters: /[\uA735]/g },
        { base: 'au', letters: /[\uA737]/g },
        { base: 'av', letters: /[\uA739\uA73B]/g },
        { base: 'ay', letters: /[\uA73D]/g },
        { base: 'b', letters: /[\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253]/g },
        { base: 'c', letters: /[\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184]/g },
        { base: 'd', letters: /[\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A]/g },
        { base: 'dz', letters: /[\u01F3\u01C6]/g },
        { base: 'e', letters: /[\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD]/g },
        { base: 'f', letters: /[\u0066\u24D5\uFF46\u1E1F\u0192\uA77C]/g },
        { base: 'g', letters: /[\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F]/g },
        { base: 'h', letters: /[\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265]/g },
        { base: 'hv', letters: /[\u0195]/g },
        { base: 'i', letters: /[\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131]/g },
        { base: 'j', letters: /[\u006A\u24D9\uFF4A\u0135\u01F0\u0249]/g },
        { base: 'k', letters: /[\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3]/g },
        { base: 'l', letters: /[\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747]/g },
        { base: 'lj', letters: /[\u01C9]/g },
        { base: 'm', letters: /[\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F]/g },
        { base: 'n', letters: /[\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5]/g },
        { base: 'nj', letters: /[\u01CC]/g },
        { base: 'o', letters: /[\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275]/g },
        { base: 'oi', letters: /[\u01A3]/g },
        { base: 'ou', letters: /[\u0223]/g },
        { base: 'oo', letters: /[\uA74F]/g },
        { base: 'p', letters: /[\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755]/g },
        { base: 'q', letters: /[\u0071\u24E0\uFF51\u024B\uA757\uA759]/g },
        { base: 'r', letters: /[\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783]/g },
        { base: 's', letters: /[\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B]/g },
        { base: 't', letters: /[\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787]/g },
        { base: 'tz', letters: /[\uA729]/g },
        { base: 'u', letters: /[\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289]/g },
        { base: 'v', letters: /[\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C]/g },
        { base: 'vy', letters: /[\uA761]/g },
        { base: 'w', letters: /[\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73]/g },
        { base: 'x', letters: /[\u0078\u24E7\uFF58\u1E8B\u1E8D]/g },
        { base: 'y', letters: /[\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF]/g },
        { base: 'z', letters: /[\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763]/g }
    ];

    // Thread the text through every rule, first to last.
    return rules.reduce(function (text, rule) {
        return text.replace(rule.letters, rule.base);
    }, str);
}
A simple and easy way:
/**
 * Replace the common accented Latin-1 vowels and ç/Ç in a string with their
 * unaccented ASCII letters.
 *
 * The name is camelCase because a hyphen is not legal in a JavaScript
 * identifier ("remove-accents" does not parse).
 *
 * @param {String} p text to clean
 * @return {String} p with every listed accented character replaced; all
 *                  other characters (including regex metacharacters such as
 *                  ".", "(" or "?") are copied through unchanged
 */
function removeAccents(p) {
    // accented.charAt(k) is replaced by plain.charAt(k).
    var accented = 'áàãâäéèêëíìîïóòõôöúùûüçÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÖÔÚÙÛÜÇ';
    var plain = 'aaaaaeeeeiiiiooooouuuucAAAAAEEEEIIIIOOOOOUUUUC';
    var result = '';
    for (var i = 0; i < p.length; i++) {
        var ch = p.charAt(i);
        // indexOf does a literal lookup; String#search would interpret the
        // character as a regular expression.
        var pos = accented.indexOf(ch);
        result += pos >= 0 ? plain.charAt(pos) : ch;
    }
    return result;
}
// So do this:
removeAccents("Thís ís ân accêntéd phráse");
Output:
"This is an accented phrase"
Here is a more complete version based on the Unicode standard, taken from here: http://semplicewebsites.com/removing-accents-javascript
/**
 * Latinise: fold accented and otherwise decorated Latin letters onto plain
 * ASCII.
 *
 * Latinise.latin_map maps single characters to their ASCII replacement.
 * It is built from Unicode compatibility decomposition (NFKD) rather than a
 * hand-maintained literal, because a literal table of several hundred
 * non-ASCII keys does not survive a lossy re-encoding: the keys collapse to
 * "?" or to plain ASCII letters, and "?" itself then gets rewritten.
 * Letters that have no decomposition (Æ, Ø, Ł, ...) are listed explicitly.
 */
var Latinise = {};
Latinise.latin_map = (function () {
    var map = {};
    var combiningMarks = /[\u0300-\u036F]/g;
    var asciiLetters = /^[A-Za-z]+$/;
    // Blocks scanned for decomposable letters.
    var ranges = [
        [0x00C0, 0x024F], // Latin-1 Supplement letters, Latin Extended-A and -B
        [0x1E00, 0x1EFF], // Latin Extended Additional
        [0xFB00, 0xFB06]  // Latin ligatures (ff, fi, fl, ffi, ffl, st)
    ];
    ranges.forEach(function (range) {
        for (var cp = range[0]; cp <= range[1]; cp++) {
            var ch = String.fromCharCode(cp);
            var base = ch.normalize("NFKD").replace(combiningMarks, "");
            // Keep only characters that reduce to pure ASCII letters.
            if (base !== ch && asciiLetters.test(base)) {
                map[ch] = base;
            }
        }
    });

    // Letters without a usable decomposition.
    var special = {
        "\u00C6": "AE", "\u00E6": "ae", // AE ligature
        "\u01FC": "AE", "\u01FD": "ae", // AE with acute
        "\u01E2": "AE", "\u01E3": "ae", // AE with macron
        "\u00D0": "D",                  // Eth
        "\u0110": "D", "\u0111": "d",   // D with stroke
        "\u0126": "H", "\u0127": "h",   // H with stroke
        "\u0131": "i",                  // dotless i
        "\u013F": "L", "\u0140": "l",   // L with middle dot
        "\u0141": "L", "\u0142": "l",   // L with stroke
        "\u0149": "n",                  // n preceded by apostrophe
        "\u00D8": "O", "\u00F8": "o",   // O with stroke
        "\u01FE": "O", "\u01FF": "o",   // O with stroke and acute
        "\u0152": "OE", "\u0153": "oe", // OE ligature
        "\u0166": "T", "\u0167": "t",   // T with stroke
        "\u0192": "f"                   // f with hook
    };
    Object.keys(special).forEach(function (ch) {
        map[ch] = special[ch];
    });
    return map;
}());

/**
 * Return a copy of the string in which every character other than ASCII
 * letters, digits, "[", "]" and space is replaced by its latin_map entry,
 * if one exists; characters without an entry are kept as they are.
 *
 * @return {String}
 */
String.prototype.latinise = function () {
    return this.replace(/[^A-Za-z0-9\[\] ]/g, function (a) {
        return Latinise.latin_map[a] || a;
    });
};
// American spelling alias.
String.prototype.latinize = String.prototype.latinise;
/**
 * @return {Boolean} true when latinise() would leave the string unchanged
 */
String.prototype.isLatin = function () {
    // String(this) unwraps the String object `this` is in sloppy mode.
    return String(this) === this.latinise();
};
Some examples:
> "Piqué".latinize();
"Pique"
> "Piqué".isLatin();
false
> "Pique".isLatin();
true
> "Piqué".latinise().isLatin();
true
A direct port to JavaScript of Kieron's solution: https://github.com/rwarasaurus/nano/blob/master/system/helpers.php#L61-73:
/**
 * Normalise a string by replacing accented / foreign Latin characters with
 * their closest ASCII equivalents (Latin-1 Supplement, Latin Extended-A and
 * a handful of Latin Extended-B letters).
 *
 * @param {String} str
 * @return {String} str with every listed character replaced; characters
 *                  that are not listed (including "?") are left untouched
 */
var normalize = (function () {
    // Parallel tables: every occurrence of a[i] is replaced by b[i].
    // The source characters are spelled out in full; a lossy re-encoding of
    // this table would turn them into "?" or plain ASCII letters and make
    // the function rewrite ordinary text.
    var a = [
        'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'ß',
        'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ÿ',
        'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č', 'Ď', 'ď', 'Đ', 'đ',
        'Ē', 'ē', 'Ĕ', 'ĕ', 'Ė', 'ė', 'Ę', 'ę', 'Ě', 'ě', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ',
        'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ',
        'Ĺ', 'ĺ', 'Ļ', 'ļ', 'Ľ', 'ľ', 'Ŀ', 'ŀ', 'Ł', 'ł', 'Ń', 'ń', 'Ņ', 'ņ', 'Ň', 'ň', 'ŉ',
        'Ō', 'ō', 'Ŏ', 'ŏ', 'Ő', 'ő', 'Œ', 'œ', 'Ŕ', 'ŕ', 'Ŗ', 'ŗ', 'Ř', 'ř', 'Ś', 'ś', 'Ŝ', 'ŝ', 'Ş', 'ş', 'Š', 'š',
        'Ţ', 'ţ', 'Ť', 'ť', 'Ŧ', 'ŧ', 'Ũ', 'ũ', 'Ū', 'ū', 'Ŭ', 'ŭ', 'Ů', 'ů', 'Ű', 'ű', 'Ų', 'ų',
        'Ŵ', 'ŵ', 'Ŷ', 'ŷ', 'Ÿ', 'Ź', 'ź', 'Ż', 'ż', 'Ž', 'ž', 'ſ', 'ƒ',
        'Ơ', 'ơ', 'Ư', 'ư', 'Ǎ', 'ǎ', 'Ǐ', 'ǐ', 'Ǒ', 'ǒ', 'Ǔ', 'ǔ', 'Ǖ', 'ǖ', 'Ǘ', 'ǘ', 'Ǚ', 'ǚ', 'Ǜ', 'ǜ',
        'Ǻ', 'ǻ', 'Ǽ', 'ǽ', 'Ǿ', 'ǿ'
    ];
    var b = [
        'A', 'A', 'A', 'A', 'A', 'A', 'AE', 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I', 'D', 'N', 'O', 'O', 'O', 'O', 'O', 'O', 'U', 'U', 'U', 'U', 'Y', 's',
        'a', 'a', 'a', 'a', 'a', 'a', 'ae', 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 'n', 'o', 'o', 'o', 'o', 'o', 'o', 'u', 'u', 'u', 'u', 'y', 'y',
        'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd',
        'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h',
        'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i', 'IJ', 'ij', 'J', 'j', 'K', 'k',
        'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n',
        'O', 'o', 'O', 'o', 'O', 'o', 'OE', 'oe', 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's', 'S', 's',
        'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
        'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's', 'f',
        'O', 'o', 'U', 'u', 'A', 'a', 'I', 'i', 'O', 'o', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',
        'A', 'a', 'AE', 'ae', 'O', 'o'
    ];
    return function (str) {
        var i = a.length;
        // split/join replaces EVERY occurrence; String#replace with a plain
        // string pattern would only replace the first one.
        while (i--) str = str.split(a[i]).join(b[i]);
        return str;
    };
}());
And a slightly modified version, using a char-map instead of two arrays:
To compare these two methods I made a simple benchmark: http://jsperf.com/replace-foreign-characters
/**
 * Normalise a string by replacing accented / foreign Latin characters with
 * their closest ASCII equivalents, using a single-pass character map.
 *
 * @param {String} str
 * @return {String} str with every mapped character replaced; unmapped
 *                  characters (including punctuation such as "?") are kept
 */
var normalize = (function () {
    // Replacement -> all source characters that fold onto it. Grouping keeps
    // the table compact and makes duplicate or stray keys easy to spot.
    var groups = {
        "A": "ÀÁÂÃÄÅĀĂĄǍǺ",
        "AE": "ÆǼ",
        "C": "ÇĆĈĊČ",
        "D": "ÐĎĐ",
        "E": "ÈÉÊËĒĔĖĘĚ",
        "G": "ĜĞĠĢ",
        "H": "ĤĦ",
        "I": "ÌÍÎÏĨĪĬĮİǏ",
        "IJ": "Ĳ",
        "J": "Ĵ",
        "K": "Ķ",
        "L": "ĹĻĽĿŁ",
        "N": "ÑŃŅŇ",
        "O": "ÒÓÔÕÖØŌŎŐƠǑǾ",
        "OE": "Œ",
        "R": "ŔŖŘ",
        "S": "ŚŜŞŠ",
        "T": "ŢŤŦ",
        "U": "ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛ",
        "W": "Ŵ",
        "Y": "ÝŶŸ",
        "Z": "ŹŻŽ",
        "a": "àáâãäåāăąǎǻ",
        "ae": "æǽ",
        "c": "çćĉċč",
        "d": "ďđ",
        "e": "èéêëēĕėęě",
        "f": "ƒ",
        "g": "ĝğġģ",
        "h": "ĥħ",
        "i": "ìíîïĩīĭįıǐ",
        "ij": "ĳ",
        "j": "ĵ",
        "k": "ķ",
        "l": "ĺļľŀł",
        "n": "ñńņňŉ",
        "o": "òóôõöøōŏőơǒǿ",
        "oe": "œ",
        "r": "ŕŗř",
        "s": "ßśŝşšſ",
        "t": "ţťŧ",
        "u": "ùúûüũūŭůűųưǔǖǘǚǜ",
        "w": "ŵ",
        "y": "ýÿŷ",
        "z": "źżž"
    },
    map = {},
    // \W matches every character outside [A-Za-z0-9_], which includes all
    // accented letters; ASCII letters never reach the lookup.
    nonWord = /\W/g,
    mapping = function (c) {
        return map[c] || c;
    };

    // Expand the grouped table into the flat char -> replacement map once.
    Object.keys(groups).forEach(function (replacement) {
        var chars = groups[replacement], i;
        for (i = 0; i < chars.length; i++) {
            map[chars.charAt(i)] = replacement;
        }
    });

    return function (str) {
        return str.replace(nonWord, mapping);
    };
}());
Simply normalize the string to a decomposed form and then strip the combining diacritical marks:
var str = "Letras Á É Í Ó Ú Ñ - á é í ó ú ñ...";
// NFKD splits each accented letter into its base letter followed by
// combining marks; the replace then drops the marks (U+0300 to U+036F).
console.log(str.normalize("NFKD").replace(/[\u0300-\u036F]/g, ""));
// Prints: Letras A E I O U N - a e i o u n...
See normalize
Then you can use this function:
/**
 * Strip combining diacritical marks from a string.
 *
 * @param {String} s text to clean
 * @return {String} s, NFKD-decomposed when String#normalize is available,
 *                  with every code unit in U+0300..U+036F removed
 */
function noTilde(s) {
    // Without normalize, only marks that are already separate are removed.
    var decomposed = s.normalize != undefined ? s.normalize("NFKD") : s;
    return decomposed.replace(/[\u0300-\u036F]/g, "");
}
I think this might work a little cleaner/better (though I haven't tested its performance):
/**
 * Replace the accented Latin-1 letters listed in the pattern below with
 * their unaccented ASCII counterparts.
 *
 * @return {String} the receiver with every listed character replaced
 */
String.prototype.stripAccents = function () {
    var accentPattern = /[àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ]/g;
    var plainLetters = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';
    // The pattern source doubles as the lookup string for positions.
    var patternSource = accentPattern.source;

    return this.replace(accentPattern, function (accented) {
        // The source starts with "[", so positions are shifted by one.
        var position = patternSource.indexOf(accented) - 1;
        return plainLetters.substr(position, 1);
    });
};
Or if you are still too worried about performance, let's get the best of both worlds:
/**
 * Replace the accented Latin-1 letters in in_chrs with the ASCII letter at
 * the same position in out_chrs.
 *
 * The regular expression and the lookup table are built once, inside the
 * closure, instead of on every call. The regex is created with the RegExp
 * constructor: eval('var ...') does not create a function-local variable in
 * strict mode, and eval should not be used to assemble code from strings.
 *
 * @return {String} the receiver with every listed character replaced
 */
String.prototype.stripAccents = (function () {
    var in_chrs = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ',
        out_chrs = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY',
        chars_rgx = new RegExp('[' + in_chrs + ']', 'g'),
        transl = {};
    for (var i = 0; i < in_chrs.length; i++) {
        transl[in_chrs.charAt(i)] = out_chrs.charAt(i);
    }
    return function () {
        return this.replace(chars_rgx, function (match) {
            return transl[match];
        });
    };
}());
EDIT (by @Tomalak)
I appreciate the idea. However, there are several things wrong with the implementation, as outlined in the comment below.
Here is how I would implement it.
/**
 * Replace the accented Latin-1 letters listed below with their unaccented
 * ASCII counterparts. The pattern and lookup table are built once.
 *
 * @param {String} s text to clean
 * @return {String} s with every listed character replaced
 */
var stripAccents = (function () {
    var accented = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ';
    var plain = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';
    var table = {};

    // accented[k] folds onto plain[k].
    accented.split('').forEach(function (ch, position) {
        table[ch] = plain[position];
    });

    var pattern = new RegExp('[' + accented + ']', 'g');

    function toPlain(ch) {
        return table[ch] || ch;
    }

    return function (s) {
        return s.replace(pattern, toPlain);
    };
})();
A long time ago I did this in Java and found someone else's solution based on a single string that captures the part of the Unicode table that was important for the conversion - the rest was converted to ? or any other replacement character. So I tried to convert it to JavaScript. Mind that I'm no JS expert. :-)
// Replacement table for code points U+00C0..U+017F: the character at index
// (codePoint - 0x00C0) is the ASCII stand-in for that code point.
// Declared with var: a bare assignment creates an implicit global and
// throws a ReferenceError in strict mode and in ES modules.
var TAB_00C0 = "AAAAAAACEEEEIIII" +
    "DNOOOOO*OUUUUYIs" +
    "aaaaaaaceeeeiiii" +
    "?nooooo/ouuuuy?y" +
    "AaAaAaCcCcCcCcDd" +
    "DdEeEeEeEeEeGgGg" +
    "GgGgHhHhIiIiIiIi" +
    "IiJjJjKkkLlLlLlL" +
    "lLlNnNnNnnNnOoOo" +
    "OoOoRrRrRrSsSsSs" +
    "SsTtTtTtUuUuUuUu" +
    "UuUuWwYyYZzZzZzF";

/**
 * Convert a string to strict ASCII, one output character per input UTF-16
 * code unit.
 *
 * @param {String} source text to convert
 * @return {String} a string of the same length: code units in
 *                  U+00C0..U+017F are replaced through TAB_00C0, any other
 *                  code unit above 127 becomes "?", ASCII is kept as is
 */
function stripDiacritics(source) {
    var result = source.split('');
    for (var i = 0; i < result.length; i++) {
        var c = source.charCodeAt(i);
        if (c >= 0x00c0 && c <= 0x017f) {
            result[i] = TAB_00C0.charAt(c - 0x00c0);
        } else if (c > 127) {
            result[i] = '?';
        }
    }
    return result.join('');
}
stripDiacritics("Šupa, co? lštcžýæøåHð");
This converts most of latin1+2 Unicode characters. It is not able to translate single char to multiple. I don't know its performance on JS, in Java this is by far the fastest of common solutions (6-50x), there is no map, there is no regex, nothing. It produces strict ASCII output, potentially with a loss of information, but the size of the output matches the input.
I tested the snippet with http://www.webtoolkitonline.com/javascript-tester.html and it produced Supa, co? lstczyaoa?? as expected.
Here is a more complete version based on the Unicode standard, taken from here: http://semplicewebsites.com/removing-accents-javascript
var Latinise={};Latinise.latin_map={"Á":"A",
"A":"A",
"?":"A",
"?":"A",
"?":"A",
"?":"A",
"?":"A",
"A":"A",
"Â":"A",
"?":"A",
"?":"A",
"?":"A",
"?":"A",
"?":"A",
"Ä":"A",
"A":"A",
"?":"A",
"?":"A",
"?":"A",
"?":"A",
"À":"A",
"?":"A",
"?":"A",
"A":"A",
"A":"A",
"Å":"A",
"?":"A",
"?":"A",
"?":"A",
"Ã":"A",
"?":"AA",
"Æ":"AE",
"?":"AE",
"?":"AE",
"?":"AO",
"?":"AU",
"?":"AV",
"?":"AV",
"?":"AY",
"?":"B",
"?":"B",
"?":"B",
"?":"B",
"?":"B",
"?":"B",
"C":"C",
"C":"C",
"Ç":"C",
"?":"C",
"C":"C",
"C":"C",
"?":"C",
"?":"C",
"D":"D",
"?":"D",
"?":"D",
"?":"D",
"?":"D",
"?":"D",
"?":"D",
"?":"D",
"?":"D",
"Ð":"D",
"?":"D",
"?":"DZ",
"?":"DZ",
"É":"E",
"E":"E",
"E":"E",
"?":"E",
"?":"E",
"Ê":"E",
"?":"E",
"?":"E",
"?":"E",
"?":"E",
"?":"E",
"?":"E",
"Ë":"E",
"E":"E",
"?":"E",
"?":"E",
"È":"E",
"?":"E",
"?":"E",
"E":"E",
"?":"E",
"?":"E",
"E":"E",
"?":"E",
"?":"E",
"?":"E",
"?":"ET",
"?":"F",
"ƒ":"F",
"?":"G",
"G":"G",
"G":"G",
"G":"G",
"G":"G",
"G":"G",
"?":"G",
"?":"G",
"G":"G",
"?":"H",
"?":"H",
"?":"H",
"H":"H",
"?":"H",
"?":"H",
"?":"H",
"?":"H",
"H":"H",
"Í":"I",
"I":"I",
"I":"I",
"Î":"I",
"Ï":"I",
"?":"I",
"I":"I",
"?":"I",
"?":"I",
"Ì":"I",
"?":"I",
"?":"I",
"I":"I",
"I":"I",
"I":"I",
"I":"I",
"?":"I",
"?":"D",
"?":"F",
"?":"G",
"?":"R",
"?":"S",
"?":"T",
"?":"IS",
"J":"J",
"?":"J",
"?":"K",
"K":"K",
"K":"K",
"?":"K",
"?":"K",
"?":"K",
"?":"K",
"?":"K",
"?":"K",
"?":"K",
"L":"L",
"?":"L",
"L":"L",
"L":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"?":"L",
"L":"L",
"?":"LJ",
"?":"M",
"?":"M",
"?":"M",
"?":"M",
"N":"N",
"N":"N",
"N":"N",
"?":"N",
"?":"N",
"?":"N",
"?":"N",
"?":"N",
"?":"N",
"?":"N",
"?":"N",
"Ñ":"N",
"?":"NJ",
"Ó":"O",
"O":"O",
"O":"O",
"Ô":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"Ö":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"O":"O",
"?":"O",
"Ò":"O",
"?":"O",
"O":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"O",
"O":"O",
"?":"O",
"?":"O",
"O":"O",
"O":"O",
"O":"O",
"Ø":"O",
"?":"O",
"Õ":"O",
"?":"O",
"?":"O",
"?":"O",
"?":"OI",
"?":"OO",
"?":"E",
"?":"O",
"?":"OU",
"?":"P",
"?":"P",
"?":"P",
"?":"P",
"?":"P",
"?":"P",
"?":"P",
"?":"Q",
"?":"Q",
"R":"R",
"R":"R",
"R":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"R",
"?":"C",
"?":"E",
"S":"S",
"?":"S",
"Š":"S",
"?":"S",
"S":"S",
"S":"S",
"?":"S",
"?":"S",
"?":"S",
"?":"S",
"T":"T",
"T":"T",
"?":"T",
"?":"T",
"?":"T",
"?":"T",
"?":"T",
"?":"T",
"?":"T",
"T":"T",
"T":"T",
"?":"A",
"?":"L",
"?":"M",
"?":"V",
"?":"TZ",
"Ú":"U",
"U":"U",
"U":"U",
"Û":"U",
"?":"U",
"Ü":"U",
"U":"U",
"U":"U",
"U":"U",
"U":"U",
"?":"U",
"?":"U",
"U":"U",
"?":"U",
"Ù":"U",
"?":"U",
"U":"U",
"?":"U",
"?":"U",
"?":"U",
"?":"U",
"?":"U",
"?":"U",
"U":"U",
"?":"U",
"U":"U",
"U":"U",
"U":"U",
"?":"U",
"?":"U",
"?":"V",
"?":"V",
"?":"V",
"?":"V",
"?":"VY",
"?":"W",
"W":"W",
"?":"W",
"?":"W",
"?":"W",
"?":"W",
"?":"W",
"?":"X",
"?":"X",
"Ý":"Y",
"Y":"Y",
"Ÿ":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"?":"Y",
"Z":"Z",
"Ž":"Z",
"?":"Z",
"?":"Z",
"Z":"Z",
"?":"Z",
"?":"Z",
"?":"Z",
"?":"Z",
"?":"IJ",
"Œ":"OE",
"?":"A",
"?":"AE",
"?":"B",
"?":"B",
"?":"C",
"?":"D",
"?":"E",
"?":"F",
"?":"G",
"?":"G",
"?":"H",
"?":"I",
"?":"R",
"?":"J",
"?":"K",
"?":"L",
"?":"L",
"?":"M",
"?":"N",
"?":"O",
"?":"OE",
"?":"O",
"?":"OU",
"?":"P",
"?":"R",
"?":"N",
"?":"R",
"?":"S",
"?":"T",
"?":"E",
"?":"R",
"?":"U",
"?":"V",
"?":"W",
"?":"Y",
"?":"Z",
"á":"a",
"a":"a",
"?":"a",
"?":"a",
"?":"a",
"?":"a",
"?":"a",
"a":"a",
"â":"a",
"?":"a",
"?":"a",
"?":"a",
"?":"a",
"?":"a",
"ä":"a",
"a":"a",
"?":"a",
"?":"a",
"?":"a",
"?":"a",
"à":"a",
"?":"a",
"?":"a",
"a":"a",
"a":"a",
"?":"a",
"?":"a",
"å":"a",
"?":"a",
"?":"a",
"?":"a",
"ã":"a",
"?":"aa",
"æ":"ae",
"?":"ae",
"?":"ae",
"?":"ao",
"?":"au",
"?":"av",
"?":"av",
"?":"ay",
"?":"b",
"?":"b",
"?":"b",
"?":"b",
"?":"b",
"?":"b",
"b":"b",
"?":"b",
"?":"o",
"c":"c",
"c":"c",
"ç":"c",
"?":"c",
"c":"c",
"?":"c",
"c":"c",
"?":"c",
"?":"c",
"d":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"?":"d",
"d":"d",
"?":"d",
"?":"d",
"i":"i",
"?":"j",
"?":"j",
"?":"j",
"?":"dz",
"?":"dz",
"é":"e",
"e":"e",
"e":"e",
"?":"e",
"?":"e",
"ê":"e",
"?":"e",
"?":"e",
"?":"e",
"?":"e",
"?":"e",
"?":"e",
"ë":"e",
"e":"e",
"?":"e",
"?":"e",
"è":"e",
"?":"e",
"?":"e",
"e":"e",
"?":"e",
"?":"e",
"?":"e",
"e":"e",
"?":"e",
"?":"e",
"?":"e",
"?":"e",
"?":"et",
"?":"f",
"ƒ":"f",
"?":"f",
"?":"f",
"?":"g",
"g":"g",
"g":"g",
"g":"g",
"g":"g",
"g":"g",
"?":"g",
"?":"g",
"?":"g",
"g":"g",
"?":"h",
"?":"h",
"?":"h",
"h":"h",
"?":"h",
"?":"h",
"?":"h",
"?":"h",
"?":"h",
"?":"h",
"h":"h",
"?":"hv",
"í":"i",
"i":"i",
"i":"i",
"î":"i",
"ï":"i",
"?":"i",
"?":"i",
"?":"i",
"ì":"i",
"?":"i",
"?":"i",
"i":"i",
"i":"i",
"?":"i",
"?":"i",
"i":"i",
"?":"i",
"?":"d",
"?":"f",
"?":"g",
"?":"r",
"?":"s",
"?":"t",
"?":"is",
"j":"j",
"j":"j",
"?":"j",
"?":"j",
"?":"k",
"k":"k",
"k":"k",
"?":"k",
"?":"k",
"?":"k",
"?":"k",
"?":"k",
"?":"k",
"?":"k",
"?":"k",
"l":"l",
"l":"l",
"?":"l",
"l":"l",
"l":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"?":"l",
"l":"l",
"?":"lj",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"m",
"?":"m",
"?":"m",
"?":"m",
"?":"m",
"?":"m",
"n":"n",
"n":"n",
"n":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"?":"n",
"ñ":"n",
"?":"nj",
"ó":"o",
"o":"o",
"o":"o",
"ô":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"ö":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"o":"o",
"?":"o",
"ò":"o",
"?":"o",
"o":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"o",
"o":"o",
"?":"o",
"?":"o",
"o":"o",
"o":"o",
"ø":"o",
"?":"o",
"õ":"o",
"?":"o",
"?":"o",
"?":"o",
"?":"oi",
"?":"oo",
"?":"e",
"?":"e",
"?":"o",
"?":"o",
"?":"ou",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"p",
"?":"q",
"?":"q",
"?":"q",
"?":"q",
"r":"r",
"r":"r",
"r":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"c",
"?":"c",
"?":"e",
"?":"r",
"s":"s",
"?":"s",
"š":"s",
"?":"s",
"s":"s",
"s":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"?":"s",
"g":"g",
"?":"o",
"?":"o",
"?":"u",
"t":"t",
"t":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"?":"t",
"t":"t",
"?":"t",
"t":"t",
"?":"th",
"?":"a",
"?":"ae",
"?":"e",
"?":"g",
"?":"h",
"?":"h",
"?":"h",
"?":"i",
"?":"k",
"?":"l",
"?":"m",
"?":"m",
"?":"oe",
"?":"r",
"?":"r",
"?":"r",
"?":"r",
"?":"t",
"?":"v",
"?":"w",
"?":"y",
"?":"tz",
"ú":"u",
"u":"u",
"u":"u",
"û":"u",
"?":"u",
"ü":"u",
"u":"u",
"u":"u",
"u":"u",
"u":"u",
"?":"u",
"?":"u",
"u":"u",
"?":"u",
"ù":"u",
"?":"u",
"u":"u",
"?":"u",
"?":"u",
"?":"u",
"?":"u",
"?":"u",
"?":"u",
"u":"u",
"?":"u",
"u":"u",
"?":"u",
"u":"u",
"u":"u",
"?":"u",
"?":"u",
"?":"ue",
"?":"um",
"?":"v",
"?":"v",
"?":"v",
"?":"v",
"?":"v",
"?":"v",
"?":"v",
"?":"vy",
"?":"w",
"w":"w",
"?":"w",
"?":"w",
"?":"w",
"?":"w",
"?":"w",
"?":"w",
"?":"x",
"?":"x",
"?":"x",
"ý":"y",
"y":"y",
"ÿ":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"?":"y",
"z":"z",
"ž":"z",
"?":"z",
"?":"z",
"?":"z",
"z":"z",
"?":"z",
"?":"z",
"?":"z",
"?":"z",
"?":"z",
"?":"z",
"z":"z",
"?":"z",
"?":"ff",
"?":"ffi",
"?":"ffl",
"?":"fi",
"?":"fl",
"?":"ij",
"œ":"oe",
"?":"st",
"?":"a",
"?":"e",
"?":"i",
"?":"j",
"?":"o",
"?":"r",
"?":"u",
"?":"v",
"?":"x"};
/**
 * Returns a copy of the string in which every character other than
 * A-Z, a-z, 0-9, '[', ']' and space is looked up in Latinise.latin_map and
 * replaced by the mapped value; characters without a (truthy) mapping are kept.
 */
String.prototype.latinise = function () {
  return this.replace(/[^A-Za-z0-9\[\] ]/g, function (ch) {
    var mapped = Latinise.latin_map[ch];
    return mapped ? mapped : ch;
  });
};

/** American-spelling alias of latinise(). */
String.prototype.latinize = String.prototype.latinise;

/** True when latinise() leaves the string unchanged. */
String.prototype.isLatin = function () {
  var converted = this.latinise();
  return this == converted;
};
Some examples:
> "Piqué".latinize();
"Pique"
> "Piqué".isLatin();
false
> "Pique".isLatin();
true
> "Piqué".latinise().isLatin();
true
https://stackoverflow.com/a/37511463
With ES2015/ES6 String.prototype.normalize():
const str = "Crème Brulée";
str.normalize('NFD').replace(/[\u0300-\u036f]/g, ""); // > 'Creme Brulee'
Two things are happening here:
- normalize()ing to NFD Unicode normal form decomposes combined graphemes into the combination of simple ones. The è of Crème ends up expressed as e + U+0300 (the combining grave accent).
- Using a regex character class to match the U+0300 → U+036F range, it is now trivial to globally get rid of the diacritics, which the Unicode standard conveniently groups as the Combining Diacritical Marks Unicode block.
See comment for performance testing.
Alternatively, if you just want sorting
Intl.Collator has sufficient support ~85% right now, a polyfill is also available here but I haven't tested it.
const c = new Intl.Collator(); ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé', 'creme brulay', 'creme brulfé', 'creme bruléa'].sort(c.compare) [ 'crame brulai','creme brulay','creme bruléa','creme brulee', 'crème brulée','creme brulfé','crome brouillé' ] ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé'].sort((a,b) => a>b) ["crame brulai", "creme brulee", "crome brouillé", "crème brulée"]
If you're looking specifically for a way to convert accented characters to non-accented characters, rather than a way to sort accented characters, with a little finagling, the String.localeCompare function can be manipulated to find the basic latin characters that match the extended ones. For example, you might want to produce a human friendly url slug from a page title. If so, you can do something like this:
// Candidate replacements: the 26 lowercase ASCII letters ...
var baseChars = [];
for (var i = 97; i < 97 + 26; i++) {
  baseChars.push(String.fromCharCode(i));
}

// ... followed by the multi-letter expansions of compound characters.
baseChars = baseChars.concat('ss,aa,ae,ao,au,av,ay,dz,hv,lj,nj,oi,ou,oo,tz,vy'.split(','));

/**
 * Reports whether lower-casing `c` changes it.
 *
 * @param {string} c - Character to test.
 * @returns {boolean} true when c differs from c.toLocaleLowerCase().
 */
function isUpperCase(c) { return c !== c.toLocaleLowerCase(); }

/**
 * Finds the entry of baseChars that compares equal to `c` under
 * localeCompare with {sensitivity: 'base'}.
 *
 * @param {string} c - Character to convert.
 * @param {Object} [opts] - Options; the object is only read, never modified.
 * @param {string} [opts.locale='en'] - Locale passed to localeCompare.
 * @param {string} [opts.nonAlphaChar] - Returned instead of `c` when `c`
 *     compares less than or equal to '9' (treated as non-alphabetical).
 * @param {string} [opts.noMatchChar] - Returned instead of `c` when no entry
 *     of baseChars matches.
 * @returns {string} The matching base character (upper-cased when `c` is
 *     upper case), or `c` / the configured fallback.
 */
function toBaseChar(c, opts) {
  opts = opts || {};
  // Read the locale into a local instead of writing a default back into the
  // caller's options object.
  var locale = ('locale' in opts) ? opts.locale : 'en';

  var cOpts = {sensitivity: 'base'};

  //exit early for any non-alphabetical character
  if (c.localeCompare('9', locale, cOpts) <= 0) {
    return opts.nonAlphaChar === undefined ? c : opts.nonAlphaChar;
  }

  for (var i = 0; i < baseChars.length; i++) {
    var baseChar = baseChars[i];

    if (c.localeCompare(baseChar, locale, cOpts) === 0) {
      return isUpperCase(c) ? baseChar.toUpperCase() : baseChar;
    }
  }

  return opts.noMatchChar === undefined ? c : opts.noMatchChar;
}

/**
 * Replaces every character of `str` that is not matched by \w, \s or \d
 * with the result of toBaseChar().
 *
 * @param {string} str - Text to convert.
 * @param {Object} [opts] - Options forwarded to toBaseChar().
 * @returns {string} The converted text.
 */
function latinify(str, opts) {
  return str.replace(/[^\w\s\d]/g, function(c) {
    return toBaseChar(c, opts);
  });
}

// Example:
console.log(latinify('Čeština Tsehesenestsestotse Tshivenḓa Emigliàn–Rumagnòl Slovenščina Português Tiếng Việt Straße'))

// "Cestina Tsehesenestsestotse Tshivenda Emiglian–Rumagnol Slovenscina Portugues Tieng Viet Strasse"
This should perform quite well, but if further optimization were needed, a binary search could be used with localeCompare
as the comparator to locate the base character. Note that case is preserved, and options allow for either preserving, replacing, or removing characters that aren't alphabetical, or do not have matching latin characters they can be replaced with. This implementation is faster and more flexible, and should work with new characters as they are added. The disadvantage is that compound characters like '?' have to be handled specifically, if they need to be supported.
The answer of Crisalin is almost perfect. I just improved the performance by avoiding the creation of new RegExp objects on each run.
/**
 * Ordered list of accent conversions. Each entry pairs a global regular
 * expression with the ASCII text that replaces every match.
 *
 * Order matters: the German umlaut rules (ä -> ae, Ä -> Ae, ...) come first,
 * so those letters are already gone when the single-letter rules run.
 *
 * The character lists are those of the CodeIgniter foreign-character table
 * this snippet was ported from. They are written as character classes so
 * that no list can form an invalid pattern.
 */
var normalizeConversions = [
  { regex: /[äæǽ]/g, clean: 'ae' },
  { regex: /[öœ]/g, clean: 'oe' },
  { regex: /[ü]/g, clean: 'ue' },
  { regex: /[Ä]/g, clean: 'Ae' },
  { regex: /[Ü]/g, clean: 'Ue' },
  { regex: /[Ö]/g, clean: 'Oe' },
  { regex: /[ÀÁÂÃÄÅǺĀĂĄǍ]/g, clean: 'A' },
  { regex: /[àáâãåǻāăąǎª]/g, clean: 'a' },
  { regex: /[ÇĆĈĊČ]/g, clean: 'C' },
  { regex: /[çćĉċč]/g, clean: 'c' },
  { regex: /[ÐĎĐ]/g, clean: 'D' },
  { regex: /[ðďđ]/g, clean: 'd' },
  { regex: /[ÈÉÊËĒĔĖĘĚ]/g, clean: 'E' },
  { regex: /[èéêëēĕėęě]/g, clean: 'e' },
  { regex: /[ĜĞĠĢ]/g, clean: 'G' },
  { regex: /[ĝğġģ]/g, clean: 'g' },
  { regex: /[ĤĦ]/g, clean: 'H' },
  { regex: /[ĥħ]/g, clean: 'h' },
  { regex: /[ÌÍÎÏĨĪĬǏĮİ]/g, clean: 'I' },
  { regex: /[ìíîïĩīĭǐįı]/g, clean: 'i' },
  { regex: /[Ĵ]/g, clean: 'J' },
  { regex: /[ĵ]/g, clean: 'j' },
  { regex: /[Ķ]/g, clean: 'K' },
  { regex: /[ķ]/g, clean: 'k' },
  { regex: /[ĹĻĽĿŁ]/g, clean: 'L' },
  { regex: /[ĺļľŀł]/g, clean: 'l' },
  { regex: /[ÑŃŅŇ]/g, clean: 'N' },
  { regex: /[ñńņňŉ]/g, clean: 'n' },
  { regex: /[ÒÓÔÕŌŎǑŐƠØǾ]/g, clean: 'O' },
  { regex: /[òóôõōŏǒőơøǿº]/g, clean: 'o' },
  { regex: /[ŔŖŘ]/g, clean: 'R' },
  { regex: /[ŕŗř]/g, clean: 'r' },
  { regex: /[ŚŜŞŠ]/g, clean: 'S' },
  { regex: /[śŝşšſ]/g, clean: 's' },
  { regex: /[ŢŤŦ]/g, clean: 'T' },
  { regex: /[ţťŧ]/g, clean: 't' },
  { regex: /[ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛ]/g, clean: 'U' },
  { regex: /[ùúûũūŭůűųưǔǖǘǚǜ]/g, clean: 'u' },
  { regex: /[ÝŸŶ]/g, clean: 'Y' },
  { regex: /[ýÿŷ]/g, clean: 'y' },
  { regex: /[Ŵ]/g, clean: 'W' },
  { regex: /[ŵ]/g, clean: 'w' },
  { regex: /[ŹŻŽ]/g, clean: 'Z' },
  { regex: /[źżž]/g, clean: 'z' },
  { regex: /[ÆǼ]/g, clean: 'AE' },
  { regex: /[ß]/g, clean: 'ss' },
  { regex: /[Ĳ]/g, clean: 'IJ' },
  { regex: /[ĳ]/g, clean: 'ij' },
  { regex: /[Œ]/g, clean: 'OE' },
  { regex: /[ƒ]/g, clean: 'f' }
];
Usage:
/**
 * Runs every entry of normalizeConversions over `str`, in order, replacing
 * all matches of entry.regex with entry.clean.
 *
 * A bare anonymous function statement does not parse, so the function is
 * bound to a name here.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The text after all conversions have been applied.
 */
var normalizeAccents = function (str) {
  return normalizeConversions.reduce(function (current, normalizeEntry) {
    return current.replace(normalizeEntry.regex, normalizeEntry.clean);
  }, str);
};
Based on the solution by Jason Bunting, here is what I use now.
The whole thing is for the jQuery tablesorter plug-in: For (nearly correct) sorting of non-English tables with tablesorter plugin it is necessary to make use of a custom textExtraction
function.
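This one:
- translates the most common special letters to unaccented ones (for example "ä" to "a" and "ß" to "s"), and
- converts dates in German format ('dd.mm.yyyy') to a recognized format ('yyyy-mm-dd').
Be careful to save the JavaScript file in UTF-8 encoding or it won't work.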
// file encoding must be UTF-8!
/**
 * Builds a text-extraction callback (used below as tablesorter's
 * textExtraction option).
 *
 * The returned function reads the trimmed text of a node through jQuery and
 * - if the text ends in a date written as d.m.y (optionally preceded by
 *   non-digit text), returns it as "y-mm-dd" with day and month zero-padded
 *   to two digits so that the values also order correctly as plain text;
 * - otherwise returns the text with the listed accented letters replaced by
 *   their base letter ("ß" becomes "s").
 *
 * @returns {function(*): string} Extractor taking a DOM node (anything
 *     accepted by $()).
 */
function getTextExtractor()
{
  var patternLetters = /[öäüÖÄÜáàâéèêúùûóòôÁÀÂÉÈÊÚÙÛÓÒÔß]/g;
  var patternDateDmy = /^(?:\D+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})$/;
  var lookupLetters = {
    "ä": "a", "ö": "o", "ü": "u",
    "Ä": "A", "Ö": "O", "Ü": "U",
    "á": "a", "à": "a", "â": "a",
    "é": "e", "è": "e", "ê": "e",
    "ú": "u", "ù": "u", "û": "u",
    "ó": "o", "ò": "o", "ô": "o",
    "Á": "A", "À": "A", "Â": "A",
    "É": "E", "È": "E", "Ê": "E",
    "Ú": "U", "Ù": "U", "Û": "U",
    "Ó": "O", "Ò": "O", "Ô": "O",
    "ß": "s"
  };

  var letterTranslator = function(match) {
    return lookupLetters[match] || match;
  };

  // The pattern accepts one- or two-digit day/month; pad to two digits.
  var padTwo = function(digits) {
    return digits.length < 2 ? "0" + digits : digits;
  };

  return function(node) {
    var text = $.trim($(node).text());
    var date = text.match(patternDateDmy);
    if (date) {
      return [date[3], padTwo(date[2]), padTwo(date[1])].join("-");
    }
    return text.replace(patternLetters, letterTranslator);
  };
}
You can use it like this:
// Call tablesorter on every element matching "table.sortable", passing the
// function returned by getTextExtractor() as its textExtraction option.
$("table.sortable").tablesorter({
textExtraction: getTextExtractor()
});
Simply normalize the string and then remove the combining diacritical marks:
// Decompose the sample text with NFKD, delete every code point in the range
// U+0300..U+036F, and log the result.
var str = "Letras Á É Í Ó Ú Ñ - á é í ó ú ñ...";
console.log (str.normalize ("NFKD").replace (/[\u0300-\u036F]/g, ""));
// Letras A E I O U N - a e i o u n...
See normalize
Then you can use this function:
/**
 * Removes combining diacritical marks (U+0300..U+036F) from a string.
 *
 * When String.prototype.normalize is available the text is first decomposed
 * with NFKD, so precomposed letters lose their accents as well; without it,
 * only marks that are already separate code points are removed.
 *
 * @param {string} s - Text to clean.
 * @returns {string} The text without combining diacritical marks.
 */
function noTilde (s) {
  var decomposed = (s.normalize != undefined) ? s.normalize ("NFKD") : s;
  return decomposed.replace (/[\u0300-\u036F]/g, "");
}
I made a Prototype Version of this:
/**
 * Returns a copy of the string in which the German umlauts are replaced by
 * their base vowel, "ß" by "ss" and every space by "_".
 * All other characters are kept.
 */
String.prototype.strip = function() {
  var substitutes = {
    "ä":"a", "ö":"o", "ü":"u",
    "Ä":"A", "Ö":"O", "Ü":"U",
    " ":"_", "ß":"ss" // probably more to come
  };
  var owns = Object.prototype.hasOwnProperty;

  // Walk the string one code unit at a time and swap the listed characters.
  return this.split('').map(function (unit) {
    return owns.call(substitutes, unit) ? substitutes[unit] : unit;
  }).join('');
};
Use like:
// Sample input containing every character that strip() replaces.
var teststring = 'ä ö ü Ä Ö Ü ß';
// The converted text is the return value of strip(); this statement discards it.
teststring.strip();
This will return the string a_o_u_A_O_U_ss
https://stackoverflow.com/a/37511463
With ES2015/ES6 String.prototype.normalize():
const str = "Crème Brulée";
str.normalize('NFD').replace(/[\u0300-\u036f]/g, ""); // > 'Creme Brulee'
Two things are happening here:
- normalize()ing to NFD Unicode normal form decomposes combined graphemes into the combination of simple ones. The è of Crème ends up expressed as e + U+0300 (the combining grave accent).
- Using a regex character class to match the U+0300 → U+036F range, it is now trivial to globally get rid of the diacritics, which the Unicode standard conveniently groups as the Combining Diacritical Marks Unicode block.
See comment for performance testing.
Alternatively, if you just want sorting
Intl.Collator has sufficient support ~85% right now, a polyfill is also available here but I haven't tested it.
const c = new Intl.Collator(); ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé', 'creme brulay', 'creme brulfé', 'creme bruléa'].sort(c.compare) [ 'crame brulai','creme brulay','creme bruléa','creme brulee', 'crème brulée','creme brulfé','crome brouillé' ] ['creme brulee', 'crème brulée', 'crame brulai', 'crome brouillé'].sort((a,b) => a>b) ["crame brulai", "creme brulee", "crome brouillé", "crème brulée"]
Not a single answer mentions String.localeCompare
, which happens to do exactly what you originally wanted, but not what you're asking for.
// Sort a sample list with the default localeCompare ordering (no explicit
// locale or options are passed) and print it.
var list = ['a', 'b', 'c', 'o', 'u', 'z', 'ä', 'ö', 'ü'];
list.sort(function (left, right) {
  return left.localeCompare(right);
});
console.log(list);
//Outputs ['a', 'ä', 'b', 'c', 'o', 'ö', 'u', 'ü', 'z']
The second and third parameter are not supported by older browsers though. It's an option worth considering nonetheless.
Basing on existing answers and some suggestions, I've created this one:
/**
 * Returns a copy of the string with accents removed from Latin letters.
 *
 * The hand-written character lists this method used to carry were destroyed
 * by a bad re-encoding: most entries had become a literal "?" or a plain
 * ASCII letter, so every "?" in the input was turned into "A" and "ƒ" into
 * "F". The conversion is therefore done in two encoding-independent steps:
 *
 * 1. Decompose with NFD and drop the combining marks (U+0300..U+036F) that
 *    follow a Latin base letter, then recompose with NFC so that text in
 *    other scripts keeps its composed form.
 * 2. Replace the remaining Latin letters that have no canonical
 *    decomposition through a small table ("ß" maps to "s", as before).
 *
 * @returns {string} The string without Latin accents.
 */
String.prototype.removeAccents = function() {
  // Letters NFD cannot split into base letter + combining mark.
  var residual = {
    'Æ': 'AE', 'æ': 'ae',
    'Ð': 'D', 'Đ': 'D', 'đ': 'd',
    'Ƒ': 'F', 'ƒ': 'f',
    'Ł': 'L', 'ł': 'l',
    'Ø': 'O', 'ø': 'o',
    'ß': 's'
  };

  var withoutMarks = String(this)
    .normalize('NFD')
    // Only strip marks attached to Latin letters; marks on other scripts
    // are left in place and recomposed by the NFC step.
    .replace(/([A-Za-zÆæØø])[\u0300-\u036f]+/g, '$1')
    .normalize('NFC');

  return withoutMarks.replace(/[ÆæÐĐđƑƒŁłØøß]/g, function (letter) {
    return residual[letter];
  });
};
It uses real chars instead of unicode list and works well.
You can use it like
"aaa".removeAccents(); // returns "aaa"
You can easily convert this function to not be string prototype. However, as I'm fan of using string prototype in such cases, you'll have to do it yourself.
If you want to achieve sorting where "ä" comes after "a" and is not treated as the same, then you can use a function like mine.
You can always change the alphabet to get different or even weird sortings. However, if you want some letters to be equivalent, then you have to manipulate the strings like a = a.replace(/ä/, 'a')
or similar, as many have already replied above. I've included the uppercase letters if someone wants to have all uppercase words before all lowercase words (then you have to ommit .toLowerCase()
).
/**
 * Comparator for Array.prototype.sort that orders strings by a custom
 * alphabet in which each accented letter directly follows its base letter
 * (so "ä" sorts after "a" but before "b").
 *
 * Both strings are lower-cased before comparing, so case is ignored.
 * Characters that are not part of the alphabet all share the position -1
 * and therefore sort before every listed character.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Negative when a sorts first, positive when b sorts
 *     first, 0 when they are equivalent.
 */
function sortbyalphabet(a,b) {
  // Includes the plain vowels E/I/U, Y and ß, which would otherwise get
  // position -1 and sort before the digits.
  var alphabet = "0123456789AaÀàÁáÂâÃãÄäBbCcÇçDdEeÈèÉéÊêËëFfGgHhIiÌìÍíÎîÏïJjKkLlMmNnÑñOoÒòÓóÔôÕõÖöPpQqRrSsßTtUuÙùÚúÛûÜüVvWwXxYyÝýŸÿZz";
  var left = a.toLowerCase();
  var right = b.toLowerCase();

  // Compare only the positions both strings have.
  var shared = Math.min(left.length, right.length);
  for (var i = 0; i < shared; i++) {
    var diff = alphabet.indexOf(left.charAt(i)) - alphabet.indexOf(right.charAt(i));
    if (diff !== 0) {
      return diff;
    }
  }
  // sort the shorter first
  return left.length - right.length;
}
var n = ["ast", "Äste", "apfel", "äpfel", "à"];
console.log(n.sort(sortbyalphabet));
// logs ["apfel", "ast", "à", "äpfel", "Äste"]
I can't think about an easier way to efficiently remove all diacritics from a string than using this amazing solution.
See it in action:
var string = "öäüÖÄÜ";

// NFD splits each umlaut into base vowel + combining diaeresis; the replace
// then deletes every code point in U+0300..U+036F.
var string_norm = string.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
console.log(string_norm);
I've solved it another way, if you like.
Here I used two arrays where searchChars containing which will be replaced and replaceChars containing desired characters.
// Replaces, in place in `text`, every character listed in searchChars with
// the entry at the same index of replaceChars.
var text = "your input string";
var searchChars = ['Å','Ä','å','Ö','ö']; // add more characters.
var replaceChars = ['A','A','a','O','o']; // must stay index-aligned with searchChars.
var index;
for (var i = 0; i < text.length; i++) {
  // One native lookup both tests membership and yields the position.
  index = searchChars.indexOf(text[i]);
  if (index > -1) {
    text = text.slice(0, i) + replaceChars[index] + text.slice(i + 1);
  }
}
I've solved it another way, if you like.
Here I used two arrays where searchChars containing which will be replaced and replaceChars containing desired characters.
// Replaces, in place in `text`, every character listed in searchChars with
// the entry at the same index of replaceChars.
var text = "your input string";
var searchChars = ['Å','Ä','å','Ö','ö']; // add more characters.
var replaceChars = ['A','A','a','O','o']; // must stay index-aligned with searchChars.
var index;
for (var i = 0; i < text.length; i++) {
  // One native lookup both tests membership and yields the position.
  index = searchChars.indexOf(text[i]);
  if (index > -1) {
    text = text.slice(0, i) + replaceChars[index] + text.slice(i + 1);
  }
}
I think this might work a little cleaner/better (though I haven't test it's performance):
/**
 * Returns a copy of the string with the listed accented Latin-1 letters
 * replaced by their ASCII base letter.
 *
 * The replacement is found by position: a letter's index inside the source
 * text of the regular expression, minus one for the leading "[", is its
 * index in the string of base letters.
 */
String.prototype.stripAccents = function() {
  var baseLetters = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';
  var accentedRe = /[àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ]/g;

  return this.replace(accentedRe, function (letter) {
    var position = accentedRe.source.indexOf(letter) - 1;
    return baseLetters.charAt(position);
  });
};
Or if you are still too worried about performance, let's get the best of both worlds:
/**
 * Returns a copy of the string with the listed accented Latin-1 letters
 * replaced by their ASCII base letter.
 *
 * The lookup table and the regular expression are built once, in the
 * enclosing closure. The expression is created with the RegExp constructor
 * rather than eval(), which is unnecessary here and does not expose its
 * `var` to the surrounding function in strict-mode code.
 */
String.prototype.stripAccents = (function () {
  var in_chrs = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ',
      out_chrs = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY',
      transl = {},
      chars_rgx = new RegExp('[' + in_chrs + ']', 'g');

  // in_chrs and out_chrs are index-aligned.
  for (var i = 0; i < in_chrs.length; i++) {
    transl[in_chrs.charAt(i)] = out_chrs.charAt(i);
  }

  function lookup(match) {
    return transl[match];
  }

  return function () {
    return this.replace(chars_rgx, lookup);
  };
})();
EDIT (by @Tomalak)
I appreciate the idea. However, there are several things wrong with the implementation, as outlined in the comment below.
Here is how I would implement it.
/**
 * Replaces accented Latin-1 letters with their unaccented ASCII base letter.
 *
 * @param {string} s - Text to clean.
 * @returns {string} Copy of `s` with every listed accented letter replaced;
 *     all other characters are returned unchanged.
 */
var stripAccents = (function () {
  var SOURCE_LETTERS = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ';
  var TARGET_LETTERS = 'aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY';

  // Built once: SOURCE_LETTERS[k] maps to TARGET_LETTERS[k].
  var table = {};
  var k = SOURCE_LETTERS.length;
  while (k--) {
    table[SOURCE_LETTERS.charAt(k)] = TARGET_LETTERS.charAt(k);
  }

  var matcher = new RegExp('[' + SOURCE_LETTERS + ']', 'g');

  function substitute(letter) {
    return table[letter] || letter;
  }

  return function (s) {
    return s.replace(matcher, substitute);
  };
})();
I made a Prototype Version of this:
/**
 * Returns a copy of the string in which the German umlauts are replaced by
 * their base vowel, "ß" by "ss" and every space by "_".
 * All other characters are kept.
 */
String.prototype.strip = function() {
  var replacementFor = {
    "ä":"a", "ö":"o", "ü":"u",
    "Ä":"A", "Ö":"O", "Ü":"U",
    " ":"_", "ß":"ss" // probably more to come
  };

  function swap(found) {
    return replacementFor[found];
  }

  return this.replace(/[öäüÖÄÜß ]/g, swap);
};
Use like:
// Sample input containing every character that strip() replaces.
var teststring = 'ä ö ü Ä Ö Ü ß';
// The converted text is the return value of strip(); this statement discards it.
teststring.strip();
This will return the string a_o_u_A_O_U_ss
The complete solution to your request is:
/**
 * Replaces accented and special Latin characters in `str` with ASCII
 * equivalents (German umlauts become "ae"/"oe"/"ue", "ß" becomes "ss").
 *
 * The rules are applied in the order listed: the umlaut rules run first, so
 * those letters are gone before the single-letter rules are reached.
 *
 * The character lists are those of the CodeIgniter foreign-character table
 * this function was ported from. They are written as character classes so
 * that no list can form an invalid pattern.
 *
 * @param {string} str - Text to convert.
 * @returns {string} The converted text.
 */
function convert_accented_characters(str){
  var conversions = [
    ['ae', /[äæǽ]/g],
    ['oe', /[öœ]/g],
    ['ue', /[ü]/g],
    ['Ae', /[Ä]/g],
    ['Ue', /[Ü]/g],
    ['Oe', /[Ö]/g],
    ['A', /[ÀÁÂÃÄÅǺĀĂĄǍ]/g],
    ['a', /[àáâãåǻāăąǎª]/g],
    ['C', /[ÇĆĈĊČ]/g],
    ['c', /[çćĉċč]/g],
    ['D', /[ÐĎĐ]/g],
    ['d', /[ðďđ]/g],
    ['E', /[ÈÉÊËĒĔĖĘĚ]/g],
    ['e', /[èéêëēĕėęě]/g],
    ['G', /[ĜĞĠĢ]/g],
    ['g', /[ĝğġģ]/g],
    ['H', /[ĤĦ]/g],
    ['h', /[ĥħ]/g],
    ['I', /[ÌÍÎÏĨĪĬǏĮİ]/g],
    ['i', /[ìíîïĩīĭǐįı]/g],
    ['J', /[Ĵ]/g],
    ['j', /[ĵ]/g],
    ['K', /[Ķ]/g],
    ['k', /[ķ]/g],
    ['L', /[ĹĻĽĿŁ]/g],
    ['l', /[ĺļľŀł]/g],
    ['N', /[ÑŃŅŇ]/g],
    ['n', /[ñńņňŉ]/g],
    ['O', /[ÒÓÔÕŌŎǑŐƠØǾ]/g],
    ['o', /[òóôõōŏǒőơøǿº]/g],
    ['R', /[ŔŖŘ]/g],
    ['r', /[ŕŗř]/g],
    ['S', /[ŚŜŞŠ]/g],
    ['s', /[śŝşšſ]/g],
    ['T', /[ŢŤŦ]/g],
    ['t', /[ţťŧ]/g],
    ['U', /[ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛ]/g],
    ['u', /[ùúûũūŭůűųưǔǖǘǚǜ]/g],
    ['Y', /[ÝŸŶ]/g],
    ['y', /[ýÿŷ]/g],
    ['W', /[Ŵ]/g],
    ['w', /[ŵ]/g],
    ['Z', /[ŹŻŽ]/g],
    ['z', /[źżž]/g],
    ['AE', /[ÆǼ]/g],
    ['ss', /[ß]/g],
    ['IJ', /[Ĳ]/g],
    ['ij', /[ĳ]/g],
    ['OE', /[Œ]/g],
    ['f', /[ƒ]/g]
  ];

  return conversions.reduce(function (current, rule) {
    return current.replace(rule[1], rule[0]);
  }, str);
}
The answer of Crisalin is almost perfect. I just improved the performance by avoiding the creation of new RegExp objects on each run.
/**
 * Ordered list of accent conversions. Each entry pairs a global regular
 * expression with the ASCII text that replaces every match.
 *
 * Order matters: the German umlaut rules (ä -> ae, Ä -> Ae, ...) come first,
 * so those letters are already gone when the single-letter rules run.
 *
 * The character lists are those of the CodeIgniter foreign-character table
 * this snippet was ported from. They are written as character classes so
 * that no list can form an invalid pattern.
 */
var normalizeConversions = [
  { regex: /[äæǽ]/g, clean: 'ae' },
  { regex: /[öœ]/g, clean: 'oe' },
  { regex: /[ü]/g, clean: 'ue' },
  { regex: /[Ä]/g, clean: 'Ae' },
  { regex: /[Ü]/g, clean: 'Ue' },
  { regex: /[Ö]/g, clean: 'Oe' },
  { regex: /[ÀÁÂÃÄÅǺĀĂĄǍ]/g, clean: 'A' },
  { regex: /[àáâãåǻāăąǎª]/g, clean: 'a' },
  { regex: /[ÇĆĈĊČ]/g, clean: 'C' },
  { regex: /[çćĉċč]/g, clean: 'c' },
  { regex: /[ÐĎĐ]/g, clean: 'D' },
  { regex: /[ðďđ]/g, clean: 'd' },
  { regex: /[ÈÉÊËĒĔĖĘĚ]/g, clean: 'E' },
  { regex: /[èéêëēĕėęě]/g, clean: 'e' },
  { regex: /[ĜĞĠĢ]/g, clean: 'G' },
  { regex: /[ĝğġģ]/g, clean: 'g' },
  { regex: /[ĤĦ]/g, clean: 'H' },
  { regex: /[ĥħ]/g, clean: 'h' },
  { regex: /[ÌÍÎÏĨĪĬǏĮİ]/g, clean: 'I' },
  { regex: /[ìíîïĩīĭǐįı]/g, clean: 'i' },
  { regex: /[Ĵ]/g, clean: 'J' },
  { regex: /[ĵ]/g, clean: 'j' },
  { regex: /[Ķ]/g, clean: 'K' },
  { regex: /[ķ]/g, clean: 'k' },
  { regex: /[ĹĻĽĿŁ]/g, clean: 'L' },
  { regex: /[ĺļľŀł]/g, clean: 'l' },
  { regex: /[ÑŃŅŇ]/g, clean: 'N' },
  { regex: /[ñńņňŉ]/g, clean: 'n' },
  { regex: /[ÒÓÔÕŌŎǑŐƠØǾ]/g, clean: 'O' },
  { regex: /[òóôõōŏǒőơøǿº]/g, clean: 'o' },
  { regex: /[ŔŖŘ]/g, clean: 'R' },
  { regex: /[ŕŗř]/g, clean: 'r' },
  { regex: /[ŚŜŞŠ]/g, clean: 'S' },
  { regex: /[śŝşšſ]/g, clean: 's' },
  { regex: /[ŢŤŦ]/g, clean: 'T' },
  { regex: /[ţťŧ]/g, clean: 't' },
  { regex: /[ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛ]/g, clean: 'U' },
  { regex: /[ùúûũūŭůűųưǔǖǘǚǜ]/g, clean: 'u' },
  { regex: /[ÝŸŶ]/g, clean: 'Y' },
  { regex: /[ýÿŷ]/g, clean: 'y' },
  { regex: /[Ŵ]/g, clean: 'W' },
  { regex: /[ŵ]/g, clean: 'w' },
  { regex: /[ŹŻŽ]/g, clean: 'Z' },
  { regex: /[źżž]/g, clean: 'z' },
  { regex: /[ÆǼ]/g, clean: 'AE' },
  { regex: /[ß]/g, clean: 'ss' },
  { regex: /[Ĳ]/g, clean: 'IJ' },
  { regex: /[ĳ]/g, clean: 'ij' },
  { regex: /[Œ]/g, clean: 'OE' },
  { regex: /[ƒ]/g, clean: 'f' }
];
Usage:
/**
 * Runs every entry of normalizeConversions over `str`, in order, replacing
 * all matches of entry.regex with entry.clean.
 *
 * A bare anonymous function statement does not parse, so the function is
 * bound to a name here.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The text after all conversions have been applied.
 */
var normalizeAccents = function (str) {
  return normalizeConversions.reduce(function (current, normalizeEntry) {
    return current.replace(normalizeEntry.regex, normalizeEntry.clean);
  }, str);
};
For the lads using TypeScript and those who don't want to deal with string prototypes, here is a typescript version of Ed.'s answer:
// Usage example:
"Some string".replace(/[^a-zA-Z0-9-_]/g, char => ToLatinMap.get(char) || '')
// Map:
export let ToLatinMap: Map<string, string> = new Map<string, string>([
["Á", "A"],
["A", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["A", "A"],
["Â", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["Ä", "A"],
["A", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["À", "A"],
["?", "A"],
["?", "A"],
["A", "A"],
["A", "A"],
["Å", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["Ã", "A"],
["?", "AA"],
["Æ", "AE"],
["?", "AE"],
["?", "AE"],
["?", "AO"],
["?", "AU"],
["?", "AV"],
["?", "AV"],
["?", "AY"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["C", "C"],
["C", "C"],
["Ç", "C"],
["?", "C"],
["C", "C"],
["C", "C"],
["?", "C"],
["?", "C"],
["D", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["Ð", "D"],
["?", "D"],
["?", "DZ"],
["?", "DZ"],
["É", "E"],
["E", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["Ê", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["Ë", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["È", "E"],
["?", "E"],
["?", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "ET"],
["?", "F"],
["ƒ", "F"],
["?", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["?", "G"],
["?", "G"],
["G", "G"],
["?", "H"],
["?", "H"],
["?", "H"],
["H", "H"],
["?", "H"],
["?", "H"],
["?", "H"],
["?", "H"],
["H", "H"],
["Í", "I"],
["I", "I"],
["I", "I"],
["Î", "I"],
["Ï", "I"],
["?", "I"],
["I", "I"],
["?", "I"],
["?", "I"],
["Ì", "I"],
["?", "I"],
["?", "I"],
["I", "I"],
["I", "I"],
["I", "I"],
["I", "I"],
["?", "I"],
["?", "D"],
["?", "F"],
["?", "G"],
["?", "R"],
["?", "S"],
["?", "T"],
["?", "IS"],
["J", "J"],
["?", "J"],
["?", "K"],
["K", "K"],
["K", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["L", "L"],
["?", "L"],
["L", "L"],
["L", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["L", "L"],
["?", "LJ"],
["?", "M"],
["?", "M"],
["?", "M"],
["?", "M"],
["N", "N"],
["N", "N"],
["N", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["Ñ", "N"],
["?", "NJ"],
["Ó", "O"],
["O", "O"],
["O", "O"],
["Ô", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["Ö", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["Ò", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["O", "O"],
["O", "O"],
["Ø", "O"],
["?", "O"],
["Õ", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "OI"],
["?", "OO"],
["?", "E"],
["?", "O"],
["?", "OU"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "Q"],
["?", "Q"],
["R", "R"],
["R", "R"],
["R", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "C"],
["?", "E"],
["S", "S"],
["?", "S"],
["Š", "S"],
["?", "S"],
["S", "S"],
["S", "S"],
["?", "S"],
["?", "S"],
["?", "S"],
["?", "S"],
["T", "T"],
["T", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["T", "T"],
["T", "T"],
["?", "A"],
["?", "L"],
["?", "M"],
["?", "V"],
["?", "TZ"],
["Ú", "U"],
["U", "U"],
["U", "U"],
["Û", "U"],
["?", "U"],
["Ü", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["Ù", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["?", "V"],
["?", "V"],
["?", "V"],
["?", "V"],
["?", "VY"],
["?", "W"],
["W", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "X"],
["?", "X"],
["Ý", "Y"],
["Y", "Y"],
["Ÿ", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["Z", "Z"],
["Ž", "Z"],
["?", "Z"],
["?", "Z"],
["Z", "Z"],
["?", "Z"],
["?", "Z"],
["?", "Z"],
["?", "Z"],
["?", "IJ"],
["Œ", "OE"],
["?", "A"],
["?", "AE"],
["?", "B"],
["?", "B"],
["?", "C"],
["?", "D"],
["?", "E"],
["?", "F"],
["?", "G"],
["?", "G"],
["?", "H"],
["?", "I"],
["?", "R"],
["?", "J"],
["?", "K"],
["?", "L"],
["?", "L"],
["?", "M"],
["?", "N"],
["?", "O"],
["?", "OE"],
["?", "O"],
["?", "OU"],
["?", "P"],
["?", "R"],
["?", "N"],
["?", "R"],
["?", "S"],
["?", "T"],
["?", "E"],
["?", "R"],
["?", "U"],
["?", "V"],
["?", "W"],
["?", "Y"],
["?", "Z"],
["á", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["a", "a"],
["â", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["ä", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["à", "a"],
["?", "a"],
["?", "a"],
["a", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["å", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["ã", "a"],
["?", "aa"],
["æ", "ae"],
["?", "ae"],
["?", "ae"],
["?", "ao"],
["?", "au"],
["?", "av"],
["?", "av"],
["?", "ay"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["b", "b"],
["?", "b"],
["?", "o"],
["c", "c"],
["c", "c"],
["ç", "c"],
["?", "c"],
["c", "c"],
["?", "c"],
["c", "c"],
["?", "c"],
["?", "c"],
["d", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["d", "d"],
["?", "d"],
["?", "d"],
["i", "i"],
["?", "j"],
["?", "j"],
["?", "j"],
["?", "dz"],
["?", "dz"],
["é", "e"],
["e", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["ê", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["ë", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["è", "e"],
["?", "e"],
["?", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "et"],
["?", "f"],
["ƒ", "f"],
["?", "f"],
["?", "f"],
["?", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["?", "g"],
["?", "g"],
["?", "g"],
["g", "g"],
["?", "h"],
["?", "h"],
["?", "h"],
["h", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["h", "h"],
["?", "hv"],
["í", "i"],
["i", "i"],
["i", "i"],
["î", "i"],
["ï", "i"],
["?", "i"],
["?", "i"],
["?", "i"],
["ì", "i"],
["?", "i"],
["?", "i"],
["i", "i"],
["i", "i"],
["?", "i"],
["?", "i"],
["i", "i"],
["?", "i"],
["?", "d"],
["?", "f"],
["?", "g"],
["?", "r"],
["?", "s"],
["?", "t"],
["?", "is"],
["j", "j"],
["j", "j"],
["?", "j"],
["?", "j"],
["?", "k"],
["k", "k"],
["k", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["l", "l"],
["l", "l"],
["?", "l"],
["l", "l"],
["l", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["l", "l"],
["?", "lj"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["n", "n"],
["n", "n"],
["n", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["ñ", "n"],
["?", "nj"],
["ó", "o"],
["o", "o"],
["o", "o"],
["ô", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["ö", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["ò", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["o", "o"],
["ø", "o"],
["?", "o"],
["õ", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "oi"],
["?", "oo"],
["?", "e"],
["?", "e"],
["?", "o"],
["?", "o"],
["?", "ou"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "q"],
["?", "q"],
["?", "q"],
["?", "q"],
["r", "r"],
["r", "r"],
["r", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "c"],
["?", "c"],
["?", "e"],
["?", "r"],
["s", "s"],
["?", "s"],
["š", "s"],
["?", "s"],
["s", "s"],
["s", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["g", "g"],
["?", "o"],
["?", "o"],
["?", "u"],
["t", "t"],
["t", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["t", "t"],
["?", "t"],
["t", "t"],
["?", "th"],
["?", "a"],
["?", "ae"],
["?", "e"],
["?", "g"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "i"],
["?", "k"],
["?", "l"],
["?", "m"],
["?", "m"],
["?", "oe"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "t"],
["?", "v"],
["?", "w"],
["?", "y"],
["?", "tz"],
["ú", "u"],
["u", "u"],
["u", "u"],
["û", "u"],
["?", "u"],
["ü", "u"],
["u", "u"],
["u", "u"],
["u", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["ù", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["u", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["?", "ue"],
["?", "um"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "vy"],
["?", "w"],
["w", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "x"],
["?", "x"],
["?", "x"],
["ý", "y"],
["y", "y"],
["ÿ", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["z", "z"],
["ž", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["z", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["z", "z"],
["?", "z"],
["?", "ff"],
["?", "ffi"],
["?", "ffl"],
["?", "fi"],
["?", "fl"],
["?", "ij"],
["œ", "oe"],
["?", "st"],
["?", "a"],
["?", "e"],
["?", "i"],
["?", "j"],
["?", "o"],
["?", "r"],
["?", "u"],
["?", "v"],
["?", "x"],
]);
I can't think of an easier way to efficiently remove all diacritics from a string than this amazing solution.
See it in action:
// Sample input: the German umlauts from the question.
const string = "öäüÖÄÜ";

// NFD splits every accented letter into its base letter followed by combining
// marks; the regex then deletes the combining marks (U+0300–U+036F), leaving
// only the base letters.
const string_norm = string.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
console.log(string_norm);
If you want to achieve sorting where "ä" comes after "a" and is not treated as the same, then you can use a function like mine.
You can always change the alphabet to get different or even weird sortings. However, if you want some letters to be equivalent, then you have to manipulate the strings like a = a.replace(/ä/, 'a')
or similar, as many have already replied above. I've included the uppercase letters in case someone wants all uppercase words sorted before all lowercase words (then you have to omit .toLowerCase()
).
/**
 * Comparator for Array.prototype.sort that orders strings by a custom
 * alphabet in which every accented letter directly follows its base letter
 * ("a" < "à" < "ä" < "b" ...), instead of being treated as equal to it.
 *
 * Both strings are lowercased before comparing, so the comparison is
 * case-insensitive; drop the toLowerCase() calls to make the uppercase
 * entries of the alphabet significant.
 *
 * Characters that are not part of the alphabet get index -1 and therefore
 * sort before everything else (and compare equal to each other).
 *
 * @param {string} a - first string
 * @param {string} b - second string
 * @returns {number} negative if a sorts first, positive if b sorts first,
 *   0 if they are equivalent under this ordering
 */
function sortbyalphabet(a, b) {
  // Plain E/e, I/i, U/u and Y/y must be listed too; without them those
  // letters are "unknown" and sort before the digits.
  const alphabet = "0123456789AaÀàÁáÂâÃãÄäBbCcÇçDdEeÈèÉéÊêËëFfGgHhIiÌìÍíÎîÏïJjKkLlMmNnÑñOoÒòÓóÔôÕõÖöPpQqRrSsTtUuÙùÚúÛûÜüVvWwXxYyÝýŸÿZz";
  const left = a.toLowerCase();
  const right = b.toLowerCase();

  // Compare only the positions both strings have; a missing character must
  // not be looked up (indexOf("") is 0, which would pose as the digit "0").
  const sharedLength = Math.min(left.length, right.length);
  for (let i = 0; i < sharedLength; i++) {
    const diff = alphabet.indexOf(left.charAt(i)) - alphabet.indexOf(right.charAt(i));
    if (diff !== 0) {
      return diff;
    }
  }

  // One string is a prefix of the other: sort the shorter first.
  return left.length - right.length;
}
// Demo: sort() reorders n in place. The comparator lowercases only its own
// copies of the strings, so "Äste" keeps its capital letter in the result.
var n = ["ast", "Äste", "apfel", "äpfel", "à"];
console.log(n.sort(sortbyalphabet));
// should return ["apfel", "ast", "à", "äpfel", "Äste"]
A simple and easy way:
/**
 * Replaces each accented letter listed in `accented` with the unaccented
 * letter at the same position in `plain`; every other character is copied
 * through unchanged.
 *
 * Note: the function is named removeAccents because a hyphen is not allowed
 * in a JavaScript identifier ("remove-accents" does not parse).
 *
 * @param {string} p - text to clean
 * @returns {string} the text with the listed accents removed
 */
function removeAccents(p) {
  // The two strings are parallel: accented[k] maps to plain[k].
  const accented = 'áàãâäéèêëíìîïóòõôöúùûüçÁÀÃÂÄÉÈÊËÍÌÎÏÓÒÕÖÔÚÙÛÜÇ';
  const plain = 'aaaaaeeeeiiiiooooouuuucAAAAAEEEEIIIIOOOOOUUUUC';
  let result = '';
  for (let i = 0; i < p.length; i++) {
    const ch = p.charAt(i);
    // indexOf compares literally; String#search would interpret the character
    // as a regular expression ("." matches anything, "(" throws).
    const at = accented.indexOf(ch);
    result += at >= 0 ? plain.charAt(at) : ch;
  }
  return result;
}
// So do this:
removeAccents("Thís ís ân accêntéd phráse");
Output:
"This is an accented phrase"
I just wanted to post my solution using String#localeCompare
// The only characters the output may contain. Every input character is mapped
// to the first entry here that it equals under base-letter comparison.
const base_chars = [
  '1', '2', '3', '4', '5', '6', '7', '8', '9',
  '0', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
  'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '-', '_', ' '
];

/**
 * Reduces a string to the characters of base_chars.
 *
 * The input is NFKD-normalized and split into UTF-16 units; each unit is
 * replaced by the base_chars entry it matches with
 * localeCompare(..., 'en', { sensitivity: 'base' }), which ignores accents
 * and case (so uppercase input comes out lowercase). Units with no match
 * (e.g. the combining marks produced by NFKD) yield undefined from find(),
 * which join('') renders as nothing, i.e. they are dropped.
 *
 * @param {string} str - text to reduce
 * @returns {string} text consisting only of base_chars entries
 */
const fix = str => str.normalize('NFKD').split('')
  .map(c => base_chars.find(bc => bc.localeCompare(c, 'en', { sensitivity: 'base' }) === 0))
  .join('');

const str = 'OÒ óëå-123';
console.log(`fix(${str}) = ${fix(str)}`);
For the lads using TypeScript and those who don't want to deal with string prototypes, here is a typescript version of Ed.'s answer:
// Usage example:
"Some string".replace(/[^a-zA-Z0-9-_]/g, char => ToLatinMap.get(char) || '')
// Map:
export let ToLatinMap: Map<string, string> = new Map<string, string>([
["Á", "A"],
["A", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["A", "A"],
["Â", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["Ä", "A"],
["A", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["À", "A"],
["?", "A"],
["?", "A"],
["A", "A"],
["A", "A"],
["Å", "A"],
["?", "A"],
["?", "A"],
["?", "A"],
["Ã", "A"],
["?", "AA"],
["Æ", "AE"],
["?", "AE"],
["?", "AE"],
["?", "AO"],
["?", "AU"],
["?", "AV"],
["?", "AV"],
["?", "AY"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["?", "B"],
["C", "C"],
["C", "C"],
["Ç", "C"],
["?", "C"],
["C", "C"],
["C", "C"],
["?", "C"],
["?", "C"],
["D", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["?", "D"],
["Ð", "D"],
["?", "D"],
["?", "DZ"],
["?", "DZ"],
["É", "E"],
["E", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["Ê", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["Ë", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["È", "E"],
["?", "E"],
["?", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["E", "E"],
["?", "E"],
["?", "E"],
["?", "E"],
["?", "ET"],
["?", "F"],
["ƒ", "F"],
["?", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["G", "G"],
["?", "G"],
["?", "G"],
["G", "G"],
["?", "H"],
["?", "H"],
["?", "H"],
["H", "H"],
["?", "H"],
["?", "H"],
["?", "H"],
["?", "H"],
["H", "H"],
["Í", "I"],
["I", "I"],
["I", "I"],
["Î", "I"],
["Ï", "I"],
["?", "I"],
["I", "I"],
["?", "I"],
["?", "I"],
["Ì", "I"],
["?", "I"],
["?", "I"],
["I", "I"],
["I", "I"],
["I", "I"],
["I", "I"],
["?", "I"],
["?", "D"],
["?", "F"],
["?", "G"],
["?", "R"],
["?", "S"],
["?", "T"],
["?", "IS"],
["J", "J"],
["?", "J"],
["?", "K"],
["K", "K"],
["K", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["?", "K"],
["L", "L"],
["?", "L"],
["L", "L"],
["L", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["?", "L"],
["L", "L"],
["?", "LJ"],
["?", "M"],
["?", "M"],
["?", "M"],
["?", "M"],
["N", "N"],
["N", "N"],
["N", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["?", "N"],
["Ñ", "N"],
["?", "NJ"],
["Ó", "O"],
["O", "O"],
["O", "O"],
["Ô", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["Ö", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["Ò", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["?", "O"],
["?", "O"],
["O", "O"],
["O", "O"],
["O", "O"],
["Ø", "O"],
["?", "O"],
["Õ", "O"],
["?", "O"],
["?", "O"],
["?", "O"],
["?", "OI"],
["?", "OO"],
["?", "E"],
["?", "O"],
["?", "OU"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "P"],
["?", "Q"],
["?", "Q"],
["R", "R"],
["R", "R"],
["R", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "R"],
["?", "C"],
["?", "E"],
["S", "S"],
["?", "S"],
["Š", "S"],
["?", "S"],
["S", "S"],
["S", "S"],
["?", "S"],
["?", "S"],
["?", "S"],
["?", "S"],
["T", "T"],
["T", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["?", "T"],
["T", "T"],
["T", "T"],
["?", "A"],
["?", "L"],
["?", "M"],
["?", "V"],
["?", "TZ"],
["Ú", "U"],
["U", "U"],
["U", "U"],
["Û", "U"],
["?", "U"],
["Ü", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["Ù", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["?", "U"],
["U", "U"],
["?", "U"],
["U", "U"],
["U", "U"],
["U", "U"],
["?", "U"],
["?", "U"],
["?", "V"],
["?", "V"],
["?", "V"],
["?", "V"],
["?", "VY"],
["?", "W"],
["W", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "W"],
["?", "X"],
["?", "X"],
["Ý", "Y"],
["Y", "Y"],
["Ÿ", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["?", "Y"],
["Z", "Z"],
["Ž", "Z"],
["?", "Z"],
["?", "Z"],
["Z", "Z"],
["?", "Z"],
["?", "Z"],
["?", "Z"],
["?", "Z"],
["?", "IJ"],
["Œ", "OE"],
["?", "A"],
["?", "AE"],
["?", "B"],
["?", "B"],
["?", "C"],
["?", "D"],
["?", "E"],
["?", "F"],
["?", "G"],
["?", "G"],
["?", "H"],
["?", "I"],
["?", "R"],
["?", "J"],
["?", "K"],
["?", "L"],
["?", "L"],
["?", "M"],
["?", "N"],
["?", "O"],
["?", "OE"],
["?", "O"],
["?", "OU"],
["?", "P"],
["?", "R"],
["?", "N"],
["?", "R"],
["?", "S"],
["?", "T"],
["?", "E"],
["?", "R"],
["?", "U"],
["?", "V"],
["?", "W"],
["?", "Y"],
["?", "Z"],
["á", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["a", "a"],
["â", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["ä", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["à", "a"],
["?", "a"],
["?", "a"],
["a", "a"],
["a", "a"],
["?", "a"],
["?", "a"],
["å", "a"],
["?", "a"],
["?", "a"],
["?", "a"],
["ã", "a"],
["?", "aa"],
["æ", "ae"],
["?", "ae"],
["?", "ae"],
["?", "ao"],
["?", "au"],
["?", "av"],
["?", "av"],
["?", "ay"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["?", "b"],
["b", "b"],
["?", "b"],
["?", "o"],
["c", "c"],
["c", "c"],
["ç", "c"],
["?", "c"],
["c", "c"],
["?", "c"],
["c", "c"],
["?", "c"],
["?", "c"],
["d", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["?", "d"],
["d", "d"],
["?", "d"],
["?", "d"],
["i", "i"],
["?", "j"],
["?", "j"],
["?", "j"],
["?", "dz"],
["?", "dz"],
["é", "e"],
["e", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["ê", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["ë", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["è", "e"],
["?", "e"],
["?", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["e", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "e"],
["?", "et"],
["?", "f"],
["ƒ", "f"],
["?", "f"],
["?", "f"],
["?", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["g", "g"],
["?", "g"],
["?", "g"],
["?", "g"],
["g", "g"],
["?", "h"],
["?", "h"],
["?", "h"],
["h", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "h"],
["h", "h"],
["?", "hv"],
["í", "i"],
["i", "i"],
["i", "i"],
["î", "i"],
["ï", "i"],
["?", "i"],
["?", "i"],
["?", "i"],
["ì", "i"],
["?", "i"],
["?", "i"],
["i", "i"],
["i", "i"],
["?", "i"],
["?", "i"],
["i", "i"],
["?", "i"],
["?", "d"],
["?", "f"],
["?", "g"],
["?", "r"],
["?", "s"],
["?", "t"],
["?", "is"],
["j", "j"],
["j", "j"],
["?", "j"],
["?", "j"],
["?", "k"],
["k", "k"],
["k", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["?", "k"],
["l", "l"],
["l", "l"],
["?", "l"],
["l", "l"],
["l", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["?", "l"],
["l", "l"],
["?", "lj"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["?", "m"],
["n", "n"],
["n", "n"],
["n", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["?", "n"],
["ñ", "n"],
["?", "nj"],
["ó", "o"],
["o", "o"],
["o", "o"],
["ô", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["ö", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["ò", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["?", "o"],
["?", "o"],
["o", "o"],
["o", "o"],
["ø", "o"],
["?", "o"],
["õ", "o"],
["?", "o"],
["?", "o"],
["?", "o"],
["?", "oi"],
["?", "oo"],
["?", "e"],
["?", "e"],
["?", "o"],
["?", "o"],
["?", "ou"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "p"],
["?", "q"],
["?", "q"],
["?", "q"],
["?", "q"],
["r", "r"],
["r", "r"],
["r", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "c"],
["?", "c"],
["?", "e"],
["?", "r"],
["s", "s"],
["?", "s"],
["š", "s"],
["?", "s"],
["s", "s"],
["s", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["?", "s"],
["g", "g"],
["?", "o"],
["?", "o"],
["?", "u"],
["t", "t"],
["t", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["?", "t"],
["t", "t"],
["?", "t"],
["t", "t"],
["?", "th"],
["?", "a"],
["?", "ae"],
["?", "e"],
["?", "g"],
["?", "h"],
["?", "h"],
["?", "h"],
["?", "i"],
["?", "k"],
["?", "l"],
["?", "m"],
["?", "m"],
["?", "oe"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "r"],
["?", "t"],
["?", "v"],
["?", "w"],
["?", "y"],
["?", "tz"],
["ú", "u"],
["u", "u"],
["u", "u"],
["û", "u"],
["?", "u"],
["ü", "u"],
["u", "u"],
["u", "u"],
["u", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["ù", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["u", "u"],
["?", "u"],
["u", "u"],
["u", "u"],
["?", "u"],
["?", "u"],
["?", "ue"],
["?", "um"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "v"],
["?", "vy"],
["?", "w"],
["w", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "w"],
["?", "x"],
["?", "x"],
["?", "x"],
["ý", "y"],
["y", "y"],
["ÿ", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["?", "y"],
["z", "z"],
["ž", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["z", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["?", "z"],
["z", "z"],
["?", "z"],
["?", "ff"],
["?", "ffi"],
["?", "ffl"],
["?", "fi"],
["?", "fl"],
["?", "ij"],
["œ", "oe"],
["?", "st"],
["?", "a"],
["?", "e"],
["?", "i"],
["?", "j"],
["?", "o"],
["?", "r"],
["?", "u"],
["?", "v"],
["?", "x"],
]);
Based on the solution by Jason Bunting, here is what I use now.
The whole thing is for the jQuery tablesorter plug-in: For (nearly correct) sorting of non-English tables with tablesorter plugin it is necessary to make use of a custom textExtraction
function.
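This one:
- translates the most common accented letters to their unaccented counterparts (for example "ä" becomes "a"),
- changes dates in German format ('dd.mm.yyyy') to a recognized format ('yyyy-mm-dd').

Be careful to save the JavaScript file in UTF-8 encoding or it won't work.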
// file encoding must be UTF-8!
/**
 * Builds a textExtraction callback for the jQuery tablesorter plug-in.
 *
 * The returned function reads the trimmed text of a table cell and
 *  - if the text ends in a d.m.y date (optionally preceded by non-digit
 *    text), returns it as "y-m-d" (the captured digits are not zero-padded);
 *  - otherwise returns the text with the accented letters listed below
 *    replaced by plain ASCII letters.
 *
 * @returns {function(*): string} extractor taking the cell node
 */
function getTextExtractor()
{
  // All state lives in this closure, so the patterns and the lookup table are
  // created once per extractor rather than once per cell.
  const dmyDate = /^(?:\D+)?(\d{1,2})\.(\d{1,2})\.(\d{2,4})$/;
  const accentedLetters = /[öäüÖÄÜáàâéèêúùûóòôÁÀÂÉÈÊÚÙÛÓÒÔß]/g;
  const asciiFor = {
    "ä": "a", "ö": "o", "ü": "u",
    "Ä": "A", "Ö": "O", "Ü": "U",
    "á": "a", "à": "a", "â": "a",
    "é": "e", "è": "e", "ê": "e",
    "ú": "u", "ù": "u", "û": "u",
    "ó": "o", "ò": "o", "ô": "o",
    "Á": "A", "À": "A", "Â": "A",
    "É": "E", "È": "E", "Ê": "E",
    "Ú": "U", "Ù": "U", "Û": "U",
    "Ó": "O", "Ò": "O", "Ô": "O",
    "ß": "s"
  };

  // Falls back to the letter itself when the table has no entry for it.
  function toAscii(letter) {
    return asciiFor[letter] || letter;
  }

  return function(node) {
    const text = $.trim($(node).text());
    const parts = text.match(dmyDate);
    if (!parts) {
      return text.replace(accentedLetters, toAscii);
    }
    // parts[1..3] are day, month, year; emit them year-first.
    return [parts[3], parts[2], parts[1]].join("-");
  };
}
You can use it like this:
$("table.sortable").tablesorter({
textExtraction: getTextExtractor()
});
The complete solution to your request is:
/**
 * Transliterates accented Latin characters to plain ASCII.
 *
 * German umlauts are expanded first ("ä" -> "ae", "Ö" -> "Oe", ...); the
 * remaining accented letters are then reduced to their base letter, and
 * ligatures to letter pairs ("ß" -> "ss", "Æ" -> "AE", "Œ" -> "OE").
 *
 * The rules are applied in list order, so an earlier rule wins when a
 * character appears in more than one class (e.g. "Ä" becomes "Ae", never "A").
 *
 * The character classes are written as \u escapes on purpose: an earlier copy
 * of this table that used literal characters lost everything outside
 * Latin-1 in an encoding round-trip, leaving "?" alternatives that made
 * `new RegExp(...)` throw. NOTE(review): the non-Latin-1 entries were
 * reconstructed to match CodeIgniter's foreign_chars table, which this list
 * mirrors; verify against a clean copy if exact parity matters.
 *
 * @param {string} str - text to transliterate
 * @returns {string} the transliterated text
 */
function convert_accented_characters(str){
  const conversions = [
    ['ae', /[\u00E4\u00E6\u01FD]/g],
    ['oe', /[\u00F6\u0153]/g],
    ['ue', /\u00FC/g],
    ['Ae', /\u00C4/g],
    ['Ue', /\u00DC/g],
    ['Oe', /\u00D6/g],
    ['A', /[\u00C0-\u00C5\u01FA\u0100\u0102\u0104\u01CD]/g],
    ['a', /[\u00E0-\u00E3\u00E5\u01FB\u0101\u0103\u0105\u01CE\u00AA]/g],
    ['C', /[\u00C7\u0106\u0108\u010A\u010C]/g],
    ['c', /[\u00E7\u0107\u0109\u010B\u010D]/g],
    ['D', /[\u00D0\u010E\u0110]/g],
    ['d', /[\u00F0\u010F\u0111]/g],
    ['E', /[\u00C8-\u00CB\u0112\u0114\u0116\u0118\u011A]/g],
    ['e', /[\u00E8-\u00EB\u0113\u0115\u0117\u0119\u011B]/g],
    ['G', /[\u011C\u011E\u0120\u0122]/g],
    ['g', /[\u011D\u011F\u0121\u0123]/g],
    ['H', /[\u0124\u0126]/g],
    ['h', /[\u0125\u0127]/g],
    ['I', /[\u00CC-\u00CF\u0128\u012A\u012C\u01CF\u012E\u0130]/g],
    ['i', /[\u00EC-\u00EF\u0129\u012B\u012D\u01D0\u012F\u0131]/g],
    ['J', /\u0134/g],
    ['j', /\u0135/g],
    ['K', /\u0136/g],
    ['k', /\u0137/g],
    ['L', /[\u0139\u013B\u013D\u013F\u0141]/g],
    ['l', /[\u013A\u013C\u013E\u0140\u0142]/g],
    ['N', /[\u00D1\u0143\u0145\u0147]/g],
    ['n', /[\u00F1\u0144\u0146\u0148\u0149]/g],
    ['O', /[\u00D2-\u00D5\u014C\u014E\u01D1\u0150\u01A0\u00D8\u01FE]/g],
    ['o', /[\u00F2-\u00F5\u014D\u014F\u01D2\u0151\u01A1\u00F8\u01FF\u00BA]/g],
    ['R', /[\u0154\u0156\u0158]/g],
    ['r', /[\u0155\u0157\u0159]/g],
    ['S', /[\u015A\u015C\u015E\u0160]/g],
    ['s', /[\u015B\u015D\u015F\u0161\u017F]/g],
    ['T', /[\u0162\u0164\u0166]/g],
    ['t', /[\u0163\u0165\u0167]/g],
    ['U', /[\u00D9-\u00DB\u0168\u016A\u016C\u016E\u0170\u0172\u01AF\u01D3\u01D5\u01D7\u01D9\u01DB]/g],
    ['u', /[\u00F9-\u00FB\u0169\u016B\u016D\u016F\u0171\u0173\u01B0\u01D4\u01D6\u01D8\u01DA\u01DC]/g],
    ['Y', /[\u00DD\u0178\u0176]/g],
    ['y', /[\u00FD\u00FF\u0177]/g],
    ['W', /\u0174/g],
    ['w', /\u0175/g],
    ['Z', /[\u0179\u017B\u017D]/g],
    ['z', /[\u017A\u017C\u017E]/g],
    ['AE', /[\u00C6\u01FC]/g],
    ['ss', /\u00DF/g],
    ['IJ', /\u0132/g],
    ['ij', /\u0133/g],
    ['OE', /\u0152/g],
    ['f', /\u0192/g],
  ];

  let result = str;
  for (const [replacement, pattern] of conversions) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
I just wanted to post my solution using String#localeCompare
// The only characters the output may contain. Every input character is mapped
// to the first entry here that it equals under base-letter comparison.
const base_chars = [
  '1', '2', '3', '4', '5', '6', '7', '8', '9',
  '0', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
  'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q',
  'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
  '-', '_', ' '
];

/**
 * Reduces a string to the characters of base_chars.
 *
 * The input is NFKD-normalized and split into UTF-16 units; each unit is
 * replaced by the base_chars entry it matches with
 * localeCompare(..., 'en', { sensitivity: 'base' }), which ignores accents
 * and case (so uppercase input comes out lowercase). Units with no match
 * (e.g. the combining marks produced by NFKD) yield undefined from find(),
 * which join('') renders as nothing, i.e. they are dropped.
 *
 * @param {string} str - text to reduce
 * @returns {string} text consisting only of base_chars entries
 */
const fix = str => str.normalize('NFKD').split('')
  .map(c => base_chars.find(bc => bc.localeCompare(c, 'en', { sensitivity: 'base' }) === 0))
  .join('');

const str = 'OÒ óëå-123';
console.log(`fix(${str}) = ${fix(str)}`);
If you're looking specifically for a way to convert accented characters to non-accented characters, rather than a way to sort accented characters, with a little finagling, the String.localeCompare function can be manipulated to find the basic latin characters that match the extended ones. For example, you might want to produce a human friendly url slug from a page title. If so, you can do something like this:
// Candidate replacements, tried in order: the letters a–z first, then
// multi-letter expansions for compound characters (e.g. "ß" -> "ss").
var baseChars = [];
for (let code = 97; code < 97 + 26; code++) {
  baseChars.push(String.fromCharCode(code));
}

//if needed, handle fancy compound characters
baseChars = baseChars.concat('ss,aa,ae,ao,au,av,ay,dz,hv,lj,nj,oi,ou,oo,tz,vy'.split(','));

/** True when lowercasing changes the character, i.e. it is an uppercase letter. */
function isUpperCase(c) { return c !== c.toLocaleLowerCase(); }

/**
 * Maps one character to the baseChars entry it equals when compared with
 * localeCompare at 'base' sensitivity (accents and case ignored). The case of
 * the input is carried over to the result.
 *
 * @param {string} c - the character to convert
 * @param {Object} [opts]
 * @param {string} [opts.locale='en'] - locale used for the comparison
 * @param {string} [opts.nonAlphaChar] - returned for characters that collate
 *   at or before "9"; when undefined, the character is returned unchanged
 * @param {string} [opts.noMatchChar] - returned when no baseChars entry
 *   matches; when undefined, the character is returned unchanged
 * @returns {string} the replacement text
 */
function toBaseChar(c, opts) {
  const options = opts || {};
  // Read the locale without writing the default back into the caller's object.
  const locale = 'locale' in options ? options.locale : 'en';
  const collation = { sensitivity: 'base' };

  //exit early for any non-alphabetical character
  if (c.localeCompare('9', locale, collation) <= 0) {
    return options.nonAlphaChar === undefined ? c : options.nonAlphaChar;
  }

  for (const baseChar of baseChars) {
    if (c.localeCompare(baseChar, locale, collation) === 0) {
      return isUpperCase(c) ? baseChar.toUpperCase() : baseChar;
    }
  }

  return options.noMatchChar === undefined ? c : options.noMatchChar;
}

/**
 * Replaces every character of str that is not matched by \w, \s or \d with
 * the result of toBaseChar.
 *
 * @param {string} str - text to convert
 * @param {Object} [opts] - passed through to toBaseChar
 * @returns {string} the converted text
 */
function latinify(str, opts) {
  return str.replace(/[^\w\s\d]/g, function(c) {
    return toBaseChar(c, opts);
  });
}

// Example:
console.log(latinify('Čeština Tsėhesenėstsestotse Tshivenḓa Emigliàn–Rumagnòl Slovenščina Português Tiếng Việt Straße'));

// "Cestina Tsehesenestsestotse Tshivenda Emiglian–Rumagnol Slovenscina Portugues Tieng Viet Strasse"
This should perform quite well, but if further optimization were needed, a binary search could be used with localeCompare
as the comparator to locate the base character. Note that case is preserved, and options allow for either preserving, replacing, or removing characters that aren't alphabetical, or do not have matching Latin characters they can be replaced with. This implementation is faster and more flexible, and should work with new characters as they are added. The disadvantage is that compound characters like 'ꝡ' have to be handled specifically, if they need to be supported.
Basing on existing answers and some suggestions, I've created this one:
/**
 * Returns a copy of the string with accented letters replaced by their
 * unaccented base letter(s).
 *
 * removalMap maps each replacement text to the class of characters it
 * replaces. Every class is applied to the whole string in turn.
 *
 * NOTE(review): this table holds only the characters that survived in this
 * copy of the snippet. The copy had been through a lossy encoding
 * conversion: every character outside Windows-1252 had turned into a literal
 * "?" (or a plain ASCII letter) inside the character classes, which made the
 * function rewrite real question marks to "A". Those placeholders are
 * removed here; restore the wider coverage from a clean copy of the original
 * list if it is needed.
 *
 * @returns {string} the string without the listed accents
 */
String.prototype.removeAccents = function() {
  var removalMap = {
    'A'  : /[ÀÁÂÃÄÅ]/g,
    'AE' : /Æ/g,
    'C'  : /Ç/g,
    'D'  : /Ð/g,
    'E'  : /[ÈÉÊË]/g,
    'I'  : /[ÌÍÎÏ]/g,
    'N'  : /Ñ/g,
    'O'  : /[ÒÓÔÕÖØ]/g,
    'S'  : /Š/g,
    'U'  : /[ÙÚÛÜ]/g,
    'Y'  : /[ÝŸ]/g,
    'Z'  : /Ž/g,
    'a'  : /[àáâãäå]/g,
    'ae' : /æ/g,
    'c'  : /ç/g,
    'e'  : /[èéêë]/g,
    // ƒ is a lowercase letter, so it belongs here only (not under 'F').
    'f'  : /ƒ/g,
    'i'  : /[ìíîï]/g,
    'n'  : /ñ/g,
    'o'  : /[òóôõöø]/g,
    's'  : /[ßš]/g,
    'u'  : /[ùúûü]/g,
    'y'  : /[ýÿ]/g,
    'z'  : /ž/g
  };

  // String(this) yields a primitive string even when `this` is a String object.
  var str = String(this);
  Object.keys(removalMap).forEach(function(latin) {
    str = str.replace(removalMap[latin], latin);
  });
  return str;
};
It uses real chars instead of unicode list and works well.
You can use it like
"aaa".removeAccents(); // returns "aaa"
You can easily convert this function to not be string prototype. However, as I'm fan of using string prototype in such cases, you'll have to do it yourself.
The correct terminology for such accents is Diacritics. After Googling this term, I found this function which is part of backbone.paginator
. It has a very complete collection of Diacritics and replaces them with their most intuitive ascii character. I found this to be the most complete Javascript solution available today.
The full function for future reference:
function removeDiacritics (str) {
var defaultDiacriticsRemovalMap = [
{'base':'A', 'letters':/[\u0041\u24B6\uFF21\u00C0\u00C1\u00C2\u1EA6\u1EA4\u1EAA\u1EA8\u00C3\u0100\u0102\u1EB0\u1EAE\u1EB4\u1EB2\u0226\u01E0\u00C4\u01DE\u1EA2\u00C5\u01FA\u01CD\u0200\u0202\u1EA0\u1EAC\u1EB6\u1E00\u0104\u023A\u2C6F]/g},
{'base':'AA','letters':/[\uA732]/g},
{'base':'AE','letters':/[\u00C6\u01FC\u01E2]/g},
{'base':'AO','letters':/[\uA734]/g},
{'base':'AU','letters':/[\uA736]/g},
{'base':'AV','letters':/[\uA738\uA73A]/g},
{'base':'AY','letters':/[\uA73C]/g},
{'base':'B', 'letters':/[\u0042\u24B7\uFF22\u1E02\u1E04\u1E06\u0243\u0182\u0181]/g},
{'base':'C', 'letters':/[\u0043\u24B8\uFF23\u0106\u0108\u010A\u010C\u00C7\u1E08\u0187\u023B\uA73E]/g},
{'base':'D', 'letters':/[\u0044\u24B9\uFF24\u1E0A\u010E\u1E0C\u1E10\u1E12\u1E0E\u0110\u018B\u018A\u0189\uA779]/g},
{'base':'DZ','letters':/[\u01F1\u01C4]/g},
{'base':'Dz','letters':/[\u01F2\u01C5]/g},
{'base':'E', 'letters':/[\u0045\u24BA\uFF25\u00C8\u00C9\u00CA\u1EC0\u1EBE\u1EC4\u1EC2\u1EBC\u0112\u1E14\u1E16\u0114\u0116\u00CB\u1EBA\u011A\u0204\u0206\u1EB8\u1EC6\u0228\u1E1C\u0118\u1E18\u1E1A\u0190\u018E]/g},
{'base':'F', 'letters':/[\u0046\u24BB\uFF26\u1E1E\u0191\uA77B]/g},
{'base':'G', 'letters':/[\u0047\u24BC\uFF27\u01F4\u011C\u1E20\u011E\u0120\u01E6\u0122\u01E4\u0193\uA7A0\uA77D\uA77E]/g},
{'base':'H', 'letters':/[\u0048\u24BD\uFF28\u0124\u1E22\u1E26\u021E\u1E24\u1E28\u1E2A\u0126\u2C67\u2C75\uA78D]/g},
{'base':'I', 'letters':/[\u0049\u24BE\uFF29\u00CC\u00CD\u00CE\u0128\u012A\u012C\u0130\u00CF\u1E2E\u1EC8\u01CF\u0208\u020A\u1ECA\u012E\u1E2C\u0197]/g},
{'base':'J', 'letters':/[\u004A\u24BF\uFF2A\u0134\u0248]/g},
{'base':'K', 'letters':/[\u004B\u24C0\uFF2B\u1E30\u01E8\u1E32\u0136\u1E34\u0198\u2C69\uA740\uA742\uA744\uA7A2]/g},
{'base':'L', 'letters':/[\u004C\u24C1\uFF2C\u013F\u0139\u013D\u1E36\u1E38\u013B\u1E3C\u1E3A\u0141\u023D\u2C62\u2C60\uA748\uA746\uA780]/g},
{'base':'LJ','letters':/[\u01C7]/g},
{'base':'Lj','letters':/[\u01C8]/g},
{'base':'M', 'letters':/[\u004D\u24C2\uFF2D\u1E3E\u1E40\u1E42\u2C6E\u019C]/g},
{'base':'N', 'letters':/[\u004E\u24C3\uFF2E\u01F8\u0143\u00D1\u1E44\u0147\u1E46\u0145\u1E4A\u1E48\u0220\u019D\uA790\uA7A4]/g},
{'base':'NJ','letters':/[\u01CA]/g},
{'base':'Nj','letters':/[\u01CB]/g},
{'base':'O', 'letters':/[\u004F\u24C4\uFF2F\u00D2\u00D3\u00D4\u1ED2\u1ED0\u1ED6\u1ED4\u00D5\u1E4C\u022C\u1E4E\u014C\u1E50\u1E52\u014E\u022E\u0230\u00D6\u022A\u1ECE\u0150\u01D1\u020C\u020E\u01A0\u1EDC\u1EDA\u1EE0\u1EDE\u1EE2\u1ECC\u1ED8\u01EA\u01EC\u00D8\u01FE\u0186\u019F\uA74A\uA74C]/g},
{'base':'OI','letters':/[\u01A2]/g},
{'base':'OO','letters':/[\uA74E]/g},
{'base':'OU','letters':/[\u0222]/g},
{'base':'P', 'letters':/[\u0050\u24C5\uFF30\u1E54\u1E56\u01A4\u2C63\uA750\uA752\uA754]/g},
{'base':'Q', 'letters':/[\u0051\u24C6\uFF31\uA756\uA758\u024A]/g},
{'base':'R', 'letters':/[\u0052\u24C7\uFF32\u0154\u1E58\u0158\u0210\u0212\u1E5A\u1E5C\u0156\u1E5E\u024C\u2C64\uA75A\uA7A6\uA782]/g},
{'base':'S', 'letters':/[\u0053\u24C8\uFF33\u1E9E\u015A\u1E64\u015C\u1E60\u0160\u1E66\u1E62\u1E68\u0218\u015E\u2C7E\uA7A8\uA784]/g},
{'base':'T', 'letters':/[\u0054\u24C9\uFF34\u1E6A\u0164\u1E6C\u021A\u0162\u1E70\u1E6E\u0166\u01AC\u01AE\u023E\uA786]/g},
{'base':'TZ','letters':/[\uA728]/g},
{'base':'U', 'letters':/[\u0055\u24CA\uFF35\u00D9\u00DA\u00DB\u0168\u1E78\u016A\u1E7A\u016C\u00DC\u01DB\u01D7\u01D5\u01D9\u1EE6\u016E\u0170\u01D3\u0214\u0216\u01AF\u1EEA\u1EE8\u1EEE\u1EEC\u1EF0\u1EE4\u1E72\u0172\u1E76\u1E74\u0244]/g},
{'base':'V', 'letters':/[\u0056\u24CB\uFF36\u1E7C\u1E7E\u01B2\uA75E\u0245]/g},
{'base':'VY','letters':/[\uA760]/g},
{'base':'W', 'letters':/[\u0057\u24CC\uFF37\u1E80\u1E82\u0174\u1E86\u1E84\u1E88\u2C72]/g},
{'base':'X', 'letters':/[\u0058\u24CD\uFF38\u1E8A\u1E8C]/g},
{'base':'Y', 'letters':/[\u0059\u24CE\uFF39\u1EF2\u00DD\u0176\u1EF8\u0232\u1E8E\u0178\u1EF6\u1EF4\u01B3\u024E\u1EFE]/g},
{'base':'Z', 'letters':/[\u005A\u24CF\uFF3A\u0179\u1E90\u017B\u017D\u1E92\u1E94\u01B5\u0224\u2C7F\u2C6B\uA762]/g},
{'base':'a', 'letters':/[\u0061\u24D0\uFF41\u1E9A\u00E0\u00E1\u00E2\u1EA7\u1EA5\u1EAB\u1EA9\u00E3\u0101\u0103\u1EB1\u1EAF\u1EB5\u1EB3\u0227\u01E1\u00E4\u01DF\u1EA3\u00E5\u01FB\u01CE\u0201\u0203\u1EA1\u1EAD\u1EB7\u1E01\u0105\u2C65\u0250]/g},
{'base':'aa','letters':/[\uA733]/g},
{'base':'ae','letters':/[\u00E6\u01FD\u01E3]/g},
{'base':'ao','letters':/[\uA735]/g},
{'base':'au','letters':/[\uA737]/g},
{'base':'av','letters':/[\uA739\uA73B]/g},
{'base':'ay','letters':/[\uA73D]/g},
{'base':'b', 'letters':/[\u0062\u24D1\uFF42\u1E03\u1E05\u1E07\u0180\u0183\u0253]/g},
{'base':'c', 'letters':/[\u0063\u24D2\uFF43\u0107\u0109\u010B\u010D\u00E7\u1E09\u0188\u023C\uA73F\u2184]/g},
{'base':'d', 'letters':/[\u0064\u24D3\uFF44\u1E0B\u010F\u1E0D\u1E11\u1E13\u1E0F\u0111\u018C\u0256\u0257\uA77A]/g},
{'base':'dz','letters':/[\u01F3\u01C6]/g},
{'base':'e', 'letters':/[\u0065\u24D4\uFF45\u00E8\u00E9\u00EA\u1EC1\u1EBF\u1EC5\u1EC3\u1EBD\u0113\u1E15\u1E17\u0115\u0117\u00EB\u1EBB\u011B\u0205\u0207\u1EB9\u1EC7\u0229\u1E1D\u0119\u1E19\u1E1B\u0247\u025B\u01DD]/g},
{'base':'f', 'letters':/[\u0066\u24D5\uFF46\u1E1F\u0192\uA77C]/g},
{'base':'g', 'letters':/[\u0067\u24D6\uFF47\u01F5\u011D\u1E21\u011F\u0121\u01E7\u0123\u01E5\u0260\uA7A1\u1D79\uA77F]/g},
{'base':'h', 'letters':/[\u0068\u24D7\uFF48\u0125\u1E23\u1E27\u021F\u1E25\u1E29\u1E2B\u1E96\u0127\u2C68\u2C76\u0265]/g},
{'base':'hv','letters':/[\u0195]/g},
{'base':'i', 'letters':/[\u0069\u24D8\uFF49\u00EC\u00ED\u00EE\u0129\u012B\u012D\u00EF\u1E2F\u1EC9\u01D0\u0209\u020B\u1ECB\u012F\u1E2D\u0268\u0131]/g},
{'base':'j', 'letters':/[\u006A\u24D9\uFF4A\u0135\u01F0\u0249]/g},
{'base':'k', 'letters':/[\u006B\u24DA\uFF4B\u1E31\u01E9\u1E33\u0137\u1E35\u0199\u2C6A\uA741\uA743\uA745\uA7A3]/g},
{'base':'l', 'letters':/[\u006C\u24DB\uFF4C\u0140\u013A\u013E\u1E37\u1E39\u013C\u1E3D\u1E3B\u017F\u0142\u019A\u026B\u2C61\uA749\uA781\uA747]/g},
{'base':'lj','letters':/[\u01C9]/g},
{'base':'m', 'letters':/[\u006D\u24DC\uFF4D\u1E3F\u1E41\u1E43\u0271\u026F]/g},
{'base':'n', 'letters':/[\u006E\u24DD\uFF4E\u01F9\u0144\u00F1\u1E45\u0148\u1E47\u0146\u1E4B\u1E49\u019E\u0272\u0149\uA791\uA7A5]/g},
{'base':'nj','letters':/[\u01CC]/g},
{'base':'o', 'letters':/[\u006F\u24DE\uFF4F\u00F2\u00F3\u00F4\u1ED3\u1ED1\u1ED7\u1ED5\u00F5\u1E4D\u022D\u1E4F\u014D\u1E51\u1E53\u014F\u022F\u0231\u00F6\u022B\u1ECF\u0151\u01D2\u020D\u020F\u01A1\u1EDD\u1EDB\u1EE1\u1EDF\u1EE3\u1ECD\u1ED9\u01EB\u01ED\u00F8\u01FF\u0254\uA74B\uA74D\u0275]/g},
{'base':'oi','letters':/[\u01A3]/g},
{'base':'ou','letters':/[\u0223]/g},
{'base':'oo','letters':/[\uA74F]/g},
{'base':'p','letters':/[\u0070\u24DF\uFF50\u1E55\u1E57\u01A5\u1D7D\uA751\uA753\uA755]/g},
{'base':'q','letters':/[\u0071\u24E0\uFF51\u024B\uA757\uA759]/g},
{'base':'r','letters':/[\u0072\u24E1\uFF52\u0155\u1E59\u0159\u0211\u0213\u1E5B\u1E5D\u0157\u1E5F\u024D\u027D\uA75B\uA7A7\uA783]/g},
{'base':'s','letters':/[\u0073\u24E2\uFF53\u00DF\u015B\u1E65\u015D\u1E61\u0161\u1E67\u1E63\u1E69\u0219\u015F\u023F\uA7A9\uA785\u1E9B]/g},
{'base':'t','letters':/[\u0074\u24E3\uFF54\u1E6B\u1E97\u0165\u1E6D\u021B\u0163\u1E71\u1E6F\u0167\u01AD\u0288\u2C66\uA787]/g},
{'base':'tz','letters':/[\uA729]/g},
{'base':'u','letters':/[\u0075\u24E4\uFF55\u00F9\u00FA\u00FB\u0169\u1E79\u016B\u1E7B\u016D\u00FC\u01DC\u01D8\u01D6\u01DA\u1EE7\u016F\u0171\u01D4\u0215\u0217\u01B0\u1EEB\u1EE9\u1EEF\u1EED\u1EF1\u1EE5\u1E73\u0173\u1E77\u1E75\u0289]/g},
{'base':'v','letters':/[\u0076\u24E5\uFF56\u1E7D\u1E7F\u028B\uA75F\u028C]/g},
{'base':'vy','letters':/[\uA761]/g},
{'base':'w','letters':/[\u0077\u24E6\uFF57\u1E81\u1E83\u0175\u1E87\u1E85\u1E98\u1E89\u2C73]/g},
{'base':'x','letters':/[\u0078\u24E7\uFF58\u1E8B\u1E8D]/g},
{'base':'y','letters':/[\u0079\u24E8\uFF59\u1EF3\u00FD\u0177\u1EF9\u0233\u1E8F\u00FF\u1EF7\u1E99\u1EF5\u01B4\u024F\u1EFF]/g},
{'base':'z','letters':/[\u007A\u24E9\uFF5A\u017A\u1E91\u017C\u017E\u1E93\u1E95\u01B6\u0225\u0240\u2C6C\uA763]/g}
];
for(var i=0; i<defaultDiacriticsRemovalMap.length; i++) {
str = str.replace(defaultDiacriticsRemovalMap[i].letters, defaultDiacriticsRemovalMap[i].base);
}
return str;
}
A long time ago I did this in Java and found someone else's solution based on a single string that captures the part of the Unicode table that was important for the conversion - the rest was converted to ? or any other replacement character. So I tried to convert it to JavaScript. Mind that I'm no JS expert. :-)
// ASCII replacements for the code points U+00C0..U+017F, one character per code
// point (12 rows of 16). Declared explicitly: an undeclared assignment creates
// an implicit global and throws a ReferenceError in strict mode / ES modules.
var TAB_00C0 = "AAAAAAACEEEEIIII" +
    "DNOOOOO*OUUUUYIs" +
    "aaaaaaaceeeeiiii" +
    "?nooooo/ouuuuy?y" +
    "AaAaAaCcCcCcCcDd" +
    "DdEeEeEeEeEeGgGg" +
    "GgGgHhHhIiIiIiIi" +
    "IiJjJjKkkLlLlLlL" +
    "lLlNnNnNnnNnOoOo" +
    "OoOoRrRrRrSsSsSs" +
    "SsTtTtTtUuUuUuUu" +
    "UuUuWwYyYZzZzZzF";

/**
 * Converts a string to strict ASCII, one output character per UTF-16 code unit.
 *
 * Code units in U+00C0..U+017F are replaced by the matching character of
 * TAB_00C0; any other code unit above 127 becomes '?'. Code units up to 127
 * are kept as they are, so the result has the same length as the input.
 *
 * @param {string} source - The text to convert.
 * @returns {string} The ASCII-only text.
 */
function stripDiacritics(source) {
    var result = source.split('');
    for (var i = 0; i < result.length; i++) {
        var c = source.charCodeAt(i);
        if (c >= 0x00c0 && c <= 0x017f) {
            // The table starts at U+00C0, hence the offset.
            result[i] = TAB_00C0.charAt(c - 0x00c0);
        } else if (c > 127) {
            result[i] = '?';
        }
    }
    return result.join('');
}
// Example call; the input deliberately includes characters outside the table
// (such as U+210C), which come back as '?'.
stripDiacritics("Šupa, čo? ľšťčžýæøåℌð")
This converts most Latin-1 and Latin-2 Unicode characters. It is not able to translate a single character into multiple characters. I don't know its performance in JS; in Java this is by far the fastest of the common solutions (6-50x): there is no map, there is no regex, nothing. It produces strict ASCII output, potentially with a loss of information, but the size of the output matches the input.
I tested the snippet with http://www.webtoolkitonline.com/javascript-tester.html and it produced Supa, co? lstczyaoa?? as expected.
Source: Stackoverflow.com