I tried this with Visual Studio 2013 Express, using a pointer instead of an index, which sped up the process a bit. I suspect this is because the addressing is offset + register, instead of offset + register + (register<<3). C++ code.
uint64_t* bfrend = buffer+(size/8);
uint64_t* bfrptr;
// ...
{
startP = chrono::system_clock::now();
count = 0;
for (unsigned k = 0; k < 10000; k++){
// Tight unrolled loop with uint64_t
for (bfrptr = buffer; bfrptr < bfrend;){
count += __popcnt64(*bfrptr++);
count += __popcnt64(*bfrptr++);
count += __popcnt64(*bfrptr++);
count += __popcnt64(*bfrptr++);
}
}
endP = chrono::system_clock::now();
duration = chrono::duration_cast<std::chrono::nanoseconds>(endP-startP).count();
cout << "uint64_t\t" << count << '\t' << (duration/1.0E9) << " sec \t"
<< (10000.0*size)/(duration) << " GB/s" << endl;
}
assembly code: r10 = bfrptr, r15 = bfrend, rsi = count, rdi = buffer, r13 = k :
$LL5@main:
mov r10, rdi
cmp rdi, r15
jae SHORT $LN4@main
npad 4
$LL2@main:
mov rax, QWORD PTR [r10+24]
mov rcx, QWORD PTR [r10+16]
mov r8, QWORD PTR [r10+8]
mov r9, QWORD PTR [r10]
popcnt rdx, rax
popcnt rax, rcx
add rdx, rax
popcnt rax, r8
add r10, 32
add rdx, rax
popcnt rax, r9
add rsi, rax
add rsi, rdx
cmp r10, r15
jb SHORT $LL2@main
$LN4@main:
dec r13
jne SHORT $LL5@main