I would like to split a very large string (let's say, 10,000 characters) into N-size chunks.
What would be the best way in terms of performance to do this?
For instance:
"1234567890"
split by 2 would become ["12", "34", "56", "78", "90"]
.
Would something like this be possible using String.prototype.match
and if so, would that be the best way to do it in terms of performance?
This question is related to
javascript
regex
string
split
Here is the code that I am using, it uses String.prototype.slice.
Yes, it is quite long for an answer, because it tries to follow current standards as closely as possible and contains a reasonable amount of JSDoc comments. However, once minified the code is only 828 bytes, and once gzipped for transmission it is only 497 bytes.
The one method that this adds to String.prototype
(using Object.defineProperty where available) is toChunks.
A number of tests have been included to check the functionality.
Worried that the length of code will affect the performance? No need to worry, http://jsperf.com/chunk-string/3
Much of the extra code is there to be sure that the code will respond the same across multiple javascript environments.
/*jslint maxlen:80, browser:true, devel:true */_x000D_
_x000D_
/*_x000D_
* Properties used by toChunks._x000D_
*/_x000D_
_x000D_
/*property_x000D_
MAX_SAFE_INTEGER, abs, ceil, configurable, defineProperty, enumerable,_x000D_
floor, length, max, min, pow, prototype, slice, toChunks, value,_x000D_
writable_x000D_
*/_x000D_
_x000D_
/*_x000D_
* Properties used in the testing of toChunks implimentation._x000D_
*/_x000D_
_x000D_
/*property_x000D_
appendChild, createTextNode, floor, fromCharCode, getElementById, length,_x000D_
log, pow, push, random, toChunks_x000D_
*/_x000D_
_x000D_
(function () {
  'use strict';

  // Largest integer that can be represented exactly; fallback for pre-ES2015.
  var MAX_SAFE_INTEGER = Number.MAX_SAFE_INTEGER || Math.pow(2, 53) - 1;

  /**
   * Defines a new property directly on an object, or modifies an existing
   * property on an object, and returns the object. Falls back to a plain
   * assignment of descriptor.value when Object.defineProperty is missing.
   *
   * @private
   * @function
   * @param {Object} object
   * @param {string} property
   * @param {Object} descriptor
   * @return {Object}
   * @see https://goo.gl/CZnEqg
   */
  function $defineProperty(object, property, descriptor) {
    if (Object.defineProperty) {
      Object.defineProperty(object, property, descriptor);
    } else {
      object[property] = descriptor.value;
    }

    return object;
  }

  /**
   * Returns true if the operands are strictly equal with no type conversion.
   *
   * @private
   * @function
   * @param {*} a
   * @param {*} b
   * @return {boolean}
   * @see http://www.ecma-international.org/ecma-262/5.1/#sec-11.9.4
   */
  function $strictEqual(a, b) {
    return a === b;
  }

  /**
   * Returns true if the operand inputArg is undefined.
   *
   * @private
   * @function
   * @param {*} inputArg
   * @return {boolean}
   */
  function $isUndefined(inputArg) {
    return $strictEqual(typeof inputArg, 'undefined');
  }

  /**
   * The abstract operation throws an error if its argument is a value that
   * cannot be converted to an Object, otherwise returns the argument.
   *
   * @private
   * @function
   * @param {*} inputArg The object to be tested.
   * @throws {TypeError} If inputArg is null or undefined.
   * @return {*} The inputArg if coercible.
   * @see https://goo.gl/5GcmVq
   */
  function $requireObjectCoercible(inputArg) {
    var errStr;

    if (inputArg === null || $isUndefined(inputArg)) {
      errStr = 'Cannot convert argument to object: ' + inputArg;
      throw new TypeError(errStr);
    }

    return inputArg;
  }

  /**
   * The abstract operation converts its argument to a value of type string
   *
   * @private
   * @function
   * @param {*} inputArg
   * @throws {TypeError} If inputArg is a symbol.
   * @return {string}
   * @see https://people.mozilla.org/~jorendorff/es6-draft.html#sec-tostring
   */
  function $toString(inputArg) {
    var type,
      val;

    if (inputArg === null) {
      val = 'null';
    } else {
      type = typeof inputArg;
      if (type === 'string') {
        val = inputArg;
      } else if (type === 'undefined') {
        val = type;
      } else {
        if (type === 'symbol') {
          throw new TypeError('Cannot convert symbol to string');
        }

        val = String(inputArg);
      }
    }

    return val;
  }

  /**
   * Returns a string only if the arguments is coercible otherwise throws an
   * error.
   *
   * @private
   * @function
   * @param {*} inputArg
   * @throws {TypeError} If inputArg is null or undefined.
   * @return {string}
   */
  function $onlyCoercibleToString(inputArg) {
    return $toString($requireObjectCoercible(inputArg));
  }

  /**
   * The function evaluates the passed value and converts it to an integer.
   *
   * @private
   * @function
   * @param {*} inputArg The object to be converted to an integer.
   * @return {number} If the target value is NaN, null or undefined, 0 is
   *                  returned. If the target value is false, 0 is returned
   *                  and if true, 1 is returned.
   * @see http://www.ecma-international.org/ecma-262/5.1/#sec-9.4
   */
  function $toInteger(inputArg) {
    var number = +inputArg,
      val = 0;

    // NaN is the only value that is not strictly equal to itself.
    if ($strictEqual(number, number)) {
      if (!number || number === Infinity || number === -Infinity) {
        val = number;
      } else {
        // Truncate towards zero while keeping the sign.
        val = (number > 0 || -1) * Math.floor(Math.abs(number));
      }
    }

    return val;
  }

  /**
   * The abstract operation ToLength converts its argument to an integer
   * suitable for use as the length of an array-like object.
   *
   * @private
   * @function
   * @param {*} inputArg The object to be converted to a length.
   * @return {number} If len <= +0 then +0 else if len is +INFINITY then
   *                  2^53-1 else min(len, 2^53-1).
   * @see https://people.mozilla.org/~jorendorff/es6-draft.html#sec-tolength
   */
  function $toLength(inputArg) {
    return Math.min(Math.max($toInteger(inputArg), 0), MAX_SAFE_INTEGER);
  }

  if (!String.prototype.toChunks) {
    /**
     * This method chunks a string into an array of strings of a specified
     * chunk size. The last chunk may be shorter than chunkSize.
     *
     * @function
     * @this {string} The string to be chunked.
     * @param {Number} chunkSize The size of the chunks that the string will
     *                           be chunked into. Values below 1 (after
     *                           integer conversion) produce an empty array.
     * @throws {TypeError} If called on null or undefined.
     * @returns {Array} Returns an array of the chunked string.
     */
    $defineProperty(String.prototype, 'toChunks', {
      enumerable: false,
      configurable: true,
      writable: true,
      value: function (chunkSize) {
        var str = $onlyCoercibleToString(this),
          // Clamp to a finite length: with an Infinity chunk size the
          // division below would yield 0 chunks for a non-empty string.
          chunkLength = $toLength(chunkSize),
          chunked = [],
          numChunks,
          length,
          index,
          start,
          end;

        if (chunkLength < 1) {
          return chunked;
        }

        length = $toLength(str.length);
        numChunks = Math.ceil(length / chunkLength);
        index = 0;
        start = 0;
        end = chunkLength;
        // Size the result up front instead of growing it per chunk.
        chunked.length = numChunks;
        while (index < numChunks) {
          chunked[index] = str.slice(start, end);
          start = end;
          end += chunkLength;
          index += 1;
        }

        return chunked;
      }
    });
  }
}());
_x000D_
/*_x000D_
* Some tests_x000D_
*/_x000D_
_x000D_
// Browser-only smoke test for String.prototype.toChunks. It expects an
// element with id "out" to exist in the document and writes its report there.
(function () {
  'use strict';

  var pre = document.getElementById('out'),
    chunkSizes = [],
    maxChunkSize = 512,
    testString = '',
    maxTestString = 100000,
    chunkSize = 0,
    index = 1;

  // Collect the powers of two from 2 up to and including maxChunkSize.
  while (chunkSize < maxChunkSize) {
    chunkSize = Math.pow(2, index);
    chunkSizes.push(chunkSize);
    index += 1;
  }

  // Build a random test string from character codes 32..126.
  index = 0;
  while (index < maxTestString) {
    testString += String.fromCharCode(Math.floor(Math.random() * 95) + 32);
    index += 1;
  }

  /**
   * Appends one line of text to the output element.
   *
   * @param {*} result Value to print; it is concatenated with a newline.
   */
  function log(result) {
    pre.appendChild(document.createTextNode(result + '\n'));
  }

  /**
   * Chunks the test string with every collected chunk size and logs whether
   * the number of chunks and the size of the first chunk are as expected.
   */
  function test() {
    var strLength = testString.length,
      czLength = chunkSizes.length,
      czIndex = 0,
      czValue,
      result,
      numChunks,
      pass;

    while (czIndex < czLength) {
      czValue = chunkSizes[czIndex];
      numChunks = Math.ceil(strLength / czValue);
      result = testString.toChunks(czValue);
      czIndex += 1;
      log('chunksize: ' + czValue);
      log(' Number of chunks:');
      log(' Calculated: ' + numChunks);
      log(' Actual:' + result.length);
      pass = result.length === numChunks;
      log(' First chunk size: ' + result[0].length);
      pass = pass && result[0].length === czValue;
      log(' Passed: ' + pass);
      log('');
    }
  }

  test();
  log('');
  log('Simple test result');
  log('abcdefghijklmnopqrstuvwxyz'.toChunks(3));
}());
_x000D_
<pre id="out"></pre>
_x000D_
I have written an extended function, so the chunk length can also be an array of numbers, like [1,3]
/**
 * Splits the string into chunks.
 *
 * @param {number|Array} len Either a positive number (every chunk has that
 *   many characters, the last one possibly fewer) or a non-empty array of
 *   sizes that is cycled through until the string is consumed.
 * @returns {Array|undefined} The chunks; an empty array for an empty string;
 *   undefined when `len` is neither of the supported forms or when a full
 *   pass over the size array consumes nothing.
 */
String.prototype.chunkString = function (len) {
  const source = this;

  if (source.length < 1) {
    return [];
  }

  if (typeof len === 'number' && len > 0) {
    const count = Math.ceil(source.length / len);
    const pieces = new Array(count);
    let offset = 0;
    let slot = 0;
    while (slot < count) {
      const next = offset + len;
      pieces[slot] = source.substring(offset, next);
      offset = next;
      slot += 1;
    }
    return pieces;
  }

  if (typeof len === 'object' && len.length) {
    const total = source.length;
    const pieces = [];
    let cursor = 0;
    do {
      len.forEach((width) => {
        const piece = source.substring(cursor, cursor + width);
        if (piece !== '') {
          pieces.push(piece);
          cursor += piece.length;
        }
      });
      // Nothing consumed in a whole pass (e.g. len = [0]): bail out rather
      // than loop forever.
      if (cursor === 0) {
        return undefined;
      }
    } while (cursor < total);
    return pieces;
  }

  return undefined;
};
The code
"1234567890123".chunkString([1,3])
will return:
[ '1', '234', '5', '678', '9', '012', '3' ]
It splits a large string into smaller strings made up of whole words.
/**
 * Splits a string into chunks made of whole words.
 *
 * Words are appended to the current chunk while that chunk is shorter than
 * `words` characters; once it has reached the threshold the chunk is emitted
 * and the next word starts a new one. Runs of whitespace are collapsed to a
 * single space and chunks carry no leading or trailing space.
 *
 * @param {string} str Text to split.
 * @param {number} words Threshold, in characters (despite the name), at which
 *   the current chunk is closed.
 * @returns {string[]} The chunks, including the final partial one; empty
 *   when `str` contains no words.
 */
function chunkSubstr(str, words) {
  const values = [];
  let buffer = '';

  for (const word of str.split(/\s+/).filter(Boolean)) {
    if (buffer.length < words) {
      buffer = buffer === '' ? word : `${buffer} ${word}`;
    } else {
      values.push(buffer);
      buffer = word;
    }
  }

  // Emit whatever is left; the original dropped this once a chunk existed.
  if (buffer !== '') {
    values.push(buffer);
  }

  return values;
}
/**
 * Formats a number according to a mask string (minified implementation).
 *
 * @param {string} b Mask, e.g. "##,###." as in the usage below. Characters
 *   other than digits, "-", "+" and "#" are treated as separators.
 * @param {*} a Value to format; it is converted with unary plus.
 * @returns {*} `a` unchanged when the mask is falsy or `a` is not numeric,
 *   otherwise the formatted string.
 *
 * NOTE(review): the variables a, b, c, d, e and f are reused for several
 * different purposes below, so statement order matters.
 */
window.format = function(b, a) {
// Nothing to do without a mask or with a non-numeric value.
if (!b || isNaN(+a)) return a;
// A mask starting with "-" negates the value.
var a = b.charAt(0) == "-" ? -a : +a,
// j is truthy when the value is negative; a is made non-negative here.
j = a < 0 ? a = -a : 0,
// All separator characters found in the mask (null when there are none).
e = b.match(/[^\d\-\+#]/g),
// h: the last separator in the mask, or "." — used to split off the fraction part.
h = e && e[e.length - 1] || ".",
// e: the first separator when the mask has at least two, otherwise "," — used between digit groups.
e = e && e[1] && e[0] || ",",
// b becomes the mask pieces around h: b[0] integer part, b[1] fraction part.
b = b.split(h),
// Round to as many decimals as the fraction mask has characters.
a = a.toFixed(b[1] && b[1].length),
// Converting to number and back to string drops trailing zeros.
a = +a + "",
// d: index of the last "0" in the fraction mask.
d = b[1] && b[1].lastIndexOf("0"),
c = a.split(".");
// Re-expand the decimals when the value has no more than d of them.
if (!c[1] || c[1] && c[1].length <= d) a = (+a).toFixed(d + 1);
// d now holds the integer mask split into groups; b[0] is the mask without group separators.
d = b[0].split(e);
b[0] = d.join("");
// f: index of the first "0" in the integer mask.
var f = b[0] && b[0].indexOf("0");
if (f > -1)
// Left-pad the integer digits with zeros up to the mask length counted from f.
for (; c[0].length < b[0].length - f;) c[0] = "0" + c[0];
// Without a "0" in the mask, an integer part equal to zero is blanked.
else +c[0] == 0 && (c[0] = "");
a = a.split(".");
a[0] = c[0];
// c: length of the last mask group, evaluated only when the mask has a second group.
if (c = d[1] && d[d.length -
1].length) {
// Rebuild the integer digits into f, appending the separator e after every c digits counted from the right.
for (var d = a[0], f = "", k = d.length % c, g = 0, i = d.length; g < i; g++) f += d.charAt(g), !((g - k + 1) % c) && g < i - c && (f += e);
a[0] = f
}
// Append the fraction (prefixed with h) only when both mask and value have one.
a[1] = b[1] && a[1] ? h + a[1] : "";
return (j ? "-" : "") + a[0] + a[1]
};
// Usage example: format a digit string with the mask "##,###." and show the result.
var str="1234567890";
var formatstr=format( "##,###.", str);
alert(formatstr);
This splits the string from right to left, inserting a comma after every 3 characters. You can change the group size or the separator by adjusting the mask.
/**
 * Splits a string into consecutive chunks of at most `chunkSize` characters.
 *
 * @param {string} str String to split.
 * @param {number|string} chunkSize Positive integer chunk length (numeric
 *   strings are accepted).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `chunkSize` is not a positive integer.
 */
const getChunksFromString = (str, chunkSize) => {
  const size = Number(chunkSize);
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
  }
  // [\s\S] matches every character; '.' would skip line terminators and
  // silently drop them from the result.
  const regexChunk = new RegExp(`[\\s\\S]{1,${size}}`, 'g');
  // match() returns null when nothing matches (empty input); normalize to [].
  return str.match(regexChunk) ?? [];
};
Call it as needed
console.log(getChunksFromString("Hello world", 3)) // ["Hel", "lo ", "wor", "ld"]
My issue with the above solution is that it breaks the string into fixed-size chunks regardless of the position within the sentences.
I think the following is a better approach, although it needs some performance tweaking:
static chunkString(str, length, size,delimiter='\n' ) {
const result = [];
for (let i = 0; i < str.length; i++) {
const lastIndex = _.lastIndexOf(str, delimiter,size + i);
result.push(str.substr(i, lastIndex - i));
i = lastIndex;
}
return result;
}
You can definitely do something like
// Split on a capturing 2-character group (captures are kept in the result),
// then keep only the full-size pieces; this drops the empty strings between
// matches and the 1-character tail.
let pieces = "1234567890 "
  .split(/(.{2})/)
  .filter((piece) => piece.length === 2);
to get this:
[ '12', '34', '56', '78', '90' ]
If you want to dynamically input/adjust the chunk size so that the chunks are of size n, you can do this:
// Chunk size; declared explicitly because assigning to an undeclared name
// throws a ReferenceError in strict mode and ES modules.
let n = 2;
// Split on a capturing n-character group and keep only the full-size pieces.
let pieces = "1234567890 ".split(new RegExp("(.{" + n.toString() + "})")).filter(x => x.length === n);
To find all possible size n chunks in the original string, try this:
// Collect every distinct substring of length n by chunking the string at
// each of the n possible starting offsets.
let subs = new Set();
let n = 2;
let str = "1234567890 ";
// Capturing group of exactly n characters, built from n at run time.
let regex = new RegExp(`(.{${n}})`);
for (let offset = 0; offset < n; offset += 1) {
  // Chunk the current reading frame and keep only the full-size pieces.
  str.split(regex).forEach((piece) => {
    if (piece.length === n) {
      subs.add(piece);
    }
  });
  // Shift the reading frame one character to the right.
  str = str.slice(1);
}
You should end up with:
[ '12', '23', '34', '45', '56', '67', '78', '89', '90', '0 ' ]
This includes both a left and a right version, with pre-allocation. It is as fast as the RegExp implementation for small chunks, gets faster as the chunk size grows, and is memory efficient.
/**
 * Splits a string into chunks of `size` characters, filling from the left so
 * that only the last chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number} [size=3] Chunk length.
 * @returns {string[]|undefined} The chunks, or undefined when `str` is not a
 *   string.
 */
function chunkLeft (str, size = 3) {
  if (typeof str !== 'string') {
    return undefined
  }
  const total = str.length
  // Allocate the result at its final size before filling it.
  const pieces = new Array(Math.ceil(total / size))
  let slot = 0
  let cursor = 0
  while (cursor < total) {
    const next = cursor + size
    pieces[slot] = str.slice(cursor, next)
    cursor = next
    slot += 1
  }
  return pieces
}
/**
 * Splits a string into chunks of `size` characters, filling from the right
 * so that only the first chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number} [size=3] Chunk length.
 * @returns {string[]|undefined} The chunks, or undefined when `str` is not a
 *   string.
 */
function chunkRight (str, size = 3) {
  if (typeof str !== 'string') {
    return undefined
  }
  const total = str.length
  // Allocate the result at its final size before filling it.
  const pieces = new Array(Math.ceil(total / size))
  if (total === 0) {
    return pieces
  }
  // The first chunk takes the remainder, or a full chunk when it divides evenly.
  const head = str.slice(0, total % size || size)
  pieces[0] = head
  let slot = 1
  let cursor = head.length
  while (cursor < total) {
    const next = cursor + size
    pieces[slot] = str.slice(cursor, next)
    cursor = next
    slot += 1
  }
  return pieces
}

// Demo. Expected output, one line per sample:
//   undefined, [], ["1"], ["123"], ["1", "234"], ["12", "345"],
//   ["123", "456"], ["1", "234", "567"]
for (const sample of [undefined, '', '1', '123', '1234', '12345', '123456', '1234567']) {
  console.log(chunkRight(sample))
}
// Chunk the externally defined `str` into pieces of `chunkSize` characters.
// l: total length, lc: read position in `str`, c: write position in `chunks`.
var l = str.length;
var lc = 0;
var chunks = [];
var c = 0;
var chunkSize = 2;
while (lc < l) {
  chunks[c] = str.slice(lc, lc + chunkSize);
  lc += chunkSize;
  c += 1;
}
/**
 * Splits a string into chunks of `length` characters; the last chunk may be
 * shorter.
 *
 * @param {string} str String to split.
 * @param {number} [length=10] Chunk length, at least 1. Fractional values
 *   are rounded down; Infinity returns the string as a single chunk.
 * @returns {string[]} The chunks. A string no longer than `length` (the
 *   empty string included) is returned as a single-element array.
 * @throws {RangeError} If `length` is not a number of at least 1.
 */
function chunkString(str, length = 10) {
  // A zero or negative length never advances the offset, so the original
  // loop never terminated; NaN is rejected by the same comparison.
  if (!(length >= 1)) {
    throw new RangeError(`length must be at least 1, got ${length}`);
  }
  // Whole steps only: a fractional step made neighbouring chunks overlap.
  const step = Math.floor(length);
  if (str.length <= step) {
    return [str];
  }
  const result = [];
  for (let offset = 0; offset < str.length; offset += step) {
    result.push(str.slice(offset, offset + step));
  }
  return result;
}
I would use a regex...
/**
 * Splits a string into chunks of at most `chunkLength` characters using a
 * regular expression that also matches line terminators.
 *
 * @param {string} str String to split.
 * @param {number|string} chunkLength Positive integer chunk length (numeric
 *   strings are accepted).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `chunkLength` is not a positive integer.
 */
var chunkStr = function(str, chunkLength) {
  var size = +chunkLength;
  // Reject sizes that would produce a meaningless or invalid quantifier
  // such as {1,NaN} or {1,0}.
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError('chunkLength must be a positive integer, got ' + chunkLength);
  }
  // match() returns null when nothing matches (empty input); normalize to [].
  return str.match(new RegExp('[\\s\\S]{1,' + size + '}', 'g')) || [];
}
You can use reduce()
without any regex:
/**
 * Splits `str` into chunks of `n` characters with reduce() and no regex.
 *
 * Chunks are collected directly in an array. The original joined them with
 * "," and split again, which corrupted any input that itself contained a
 * comma.
 *
 * @param {string} str String to split.
 * @param {number} n Chunk length.
 * @returns {string[]} The chunks; an empty array for an empty string.
 */
(str, n) => {
  return str.split('').reduce(
    (chunks, char, index) => {
      // Same rule as before: keep extending the current chunk unless this
      // index starts a new multiple of n (index 0 opens the first chunk).
      const extendsCurrent = (index % n) || !(index)
      if (extendsCurrent && chunks.length > 0) {
        chunks[chunks.length - 1] += char
      } else {
        chunks.push(char)
      }
      return chunks
    },
    []
  )
}
Here's a solution I came up with for template strings after a little experimenting:
Usage:
chunkString(5)`testing123`
/**
 * Creates a chunking function that can be used as a template-literal tag
 * (chunkString(5)`testing123`) or called with a plain value.
 *
 * @param {number} nSize Chunk length; must be greater than zero.
 * @returns {function(*, ...*): string[]} Function returning the chunks of
 *   its input; the last chunk may be shorter than `nSize`.
 * @throws {RangeError} If `nSize` is not greater than zero.
 */
function chunkString(nSize) {
  // A zero size made the original loop bound infinite.
  if (!(nSize > 0)) {
    throw new RangeError(`nSize must be greater than zero, got ${nSize}`);
  }
  return (strToChunk, ...values) => {
    // When used as a tag, the first argument is the array of literal parts
    // (it carries a `raw` property). Interleave the interpolated values
    // instead of letting String() join the parts with commas.
    const isTemplate = Array.isArray(strToChunk) && Array.isArray(strToChunk.raw);
    const text = isTemplate
      ? strToChunk.reduce(
        (joined, part, i) => joined + part + (i < values.length ? String(values[i]) : ''),
        ''
      )
      : String(strToChunk);

    const result = [];
    for (let start = 0; start < text.length; start += nSize) {
      result.push(text.slice(start, start + nSize));
    }
    return result;
  };
}
_x000D_
// Demo: document.write() stringifies the returned array, so the chunks
// appear comma-separated.
document.write(chunkString(5)`testing123`);
// returns: testi,ng123

document.write(chunkString(3)`testing123`);
// returns: tes,tin,g12,3
_x000D_
Benchmarks compared:
- match, slice, substr and substring
- match and slice for different chunk sizes
- match and slice with small chunk size

Bottom line:
- match is very inefficient, slice is better, and on Firefox substr/substring is better still
- match is even more inefficient for short strings (even with a cached regex - probably due to regex parsing setup time)
- match is even more inefficient for large chunk sizes (probably due to its inability to "jump")
- match outperforms slice on older IE but still loses on all other systems

I created several faster variants which you can see on jsPerf. My favorite one is this:
/**
 * Splits a string into chunks of `size` characters using a pre-sized result
 * array; the last chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number} size Chunk length.
 * @returns {string[]} The chunks in order.
 */
function chunkSubstr(str, size) {
  const count = Math.ceil(str.length / size)
  const pieces = new Array(count)
  let offset = 0
  let slot = 0
  while (slot < count) {
    pieces[slot] = str.substr(offset, size)
    offset += size
    slot += 1
  }
  return pieces
}
// Repeatedly take the first `chunkSize` characters off the front of `str`
// until what remains is shorter than a full chunk.
var str = "123456789";
var chunks = [];
var chunkSize = 2;
for (; str; str = str.substr(chunkSize)) {
  if (str.length < chunkSize) {
    // Short tail: keep it as the final chunk and stop.
    chunks.push(str);
    break;
  }
  chunks.push(str.substr(0, chunkSize));
}
alert(chunks); // chunks == 12,34,56,78,9
What about this small piece of code:
/**
 * Splits a string into chunks of at most `size` characters.
 *
 * @param {string} str String to split.
 * @param {number|string} size Positive integer chunk length (numeric strings
 *   are accepted).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `size` is not a positive integer.
 */
function splitME(str, size) {
  const length = Number(size);
  if (!Number.isInteger(length) || length < 1) {
    throw new RangeError(`size must be a positive integer, got ${size}`);
  }
  // [\s\S] also matches line terminators, which '.' would silently drop.
  const subStr = new RegExp(`[\\s\\S]{1,${length}}`, 'g');
  // match() returns null when nothing matches (empty input); normalize to [].
  return str.match(subStr) ?? [];
};
In the form of a prototype function:
/**
 * Splits the string into chunks.
 *
 * With exactly one argument that argument is the chunk length; with any
 * other number of arguments the chunk length is 1 (as before).
 *
 * @param {...*} args Optional chunk length as a positive integer or a
 *   numeric string (surrounding whitespace is ignored).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If the single argument is not a positive integer.
 */
String.prototype.lsplit = function (...args) {
  let size = 1;
  if (args.length === 1) {
    // Use the parsed number in the pattern; the original embedded the raw
    // argument, or the literal text "false" when it was not finite.
    size = Number(String(args[0]).trim());
    if (!Number.isInteger(size) || size < 1) {
      throw new RangeError(`chunk length must be a positive integer, got ${String(args[0])}`);
    }
  }
  // [\s\S] keeps line terminators; match() yields null on no match.
  return String(this).match(new RegExp(`[\\s\\S]{1,${size}}`, 'g')) ?? [];
}
Using slice() method:
/**
 * Splits a string into chunks of `chunkSize` characters using slice(); the
 * last chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number|string} chunkSize Positive integer chunk length (numeric
 *   strings are accepted).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `chunkSize` is not a positive integer.
 */
function returnChunksArray(str, chunkSize) {
  var size = Number(chunkSize);
  // A missing, zero or negative size never shortened `str` in the original
  // loop, so it ran forever.
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError('chunkSize must be a positive integer, got ' + chunkSize);
  }
  var arr = [];
  // Walk an offset instead of re-slicing the remainder on every step.
  for (var start = 0; start < str.length; start += size) {
    arr.push(str.slice(start, start + size));
  }
  return arr;
}
The same can be done using substring() method.
/**
 * Splits a string into chunks of `chunkSize` characters using substring();
 * the last chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number|string} chunkSize Positive integer chunk length (numeric
 *   strings are accepted).
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `chunkSize` is not a positive integer.
 */
function returnChunksArray(str, chunkSize) {
  var size = Number(chunkSize);
  // A missing, zero or negative size never shortened `str` in the original
  // loop, so it ran forever.
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError('chunkSize must be a positive integer, got ' + chunkSize);
  }
  var arr = [];
  // Walk an offset instead of re-cutting the remainder on every step.
  for (var start = 0; start < str.length; start += size) {
    arr.push(str.substring(start, start + size));
  }
  return arr;
}
Surprise! You can use split to split.
// split() keeps the text captured by the group; dropping the empty strings
// between matches leaves the 2-character pieces plus the trailing remainder.
var parts = "1234567890 "
  .split(/(.{2})/)
  .filter(function (piece) { return piece !== ''; });
Results in [ '12', '34', '56', '78', '90', ' ' ]
This is a fast and straightforward solution -
/**
 * Splits a string into chunks of `len` characters using a pre-sized result
 * array; the last chunk may be shorter.
 *
 * @param {string} str String to split.
 * @param {number} len Positive integer chunk length.
 * @returns {string[]} The chunks; an empty array for an empty string.
 * @throws {RangeError} If `len` is not a positive integer.
 */
function chunkString (str, len) {
  // Without this check a zero or negative len surfaced as an opaque
  // "Invalid array length" error from Array().
  if (!Number.isInteger(len) || len < 1) {
    throw new RangeError(`len must be a positive integer, got ${len}`)
  }
  const size = Math.ceil(str.length / len)
  const r = Array(size)
  let offset = 0

  for (let i = 0; i < size; i++) {
    // slice() replaces the deprecated substr().
    r[i] = str.slice(offset, offset + len)
    offset += len
  }

  return r
}
_x000D_
// Demo and rough timing for chunkString (defined above).
console.log(chunkString("helloworld", 3))
// => [ "hel", "low", "orl", "d" ]

// 10,000 char string
const bigString = "helloworld".repeat(1000)
console.time("perf")
const result = chunkString(bigString, 3)
console.timeEnd("perf")
console.log(result)
// => perf: 0.385 ms
// => [ "hel", "low", "orl", "dhe", "llo", "wor", ... ]
_x000D_
Source: Stackoverflow.com