There are optimizations the compiler can't do because they only work for a reduced set of inputs.
Below is C++ sample code that performs a faster division using a 64-bit "multiplication by the reciprocal". Both the numerator and the denominator must be below a certain threshold. Note that it must be compiled to use 64-bit instructions to actually be faster than a normal division.
#include <stdio.h>
#include <chrono>
// Fixed-point configuration for the reciprocal-multiplication division:
//   s_bc - number of fractional bits,
//   s_p  - the fixed-point scale, 2^s_bc,
//   s_hp - half the scale, added in fastDiv() to round the correction term.
static constexpr unsigned s_bc = 32;
static constexpr unsigned long long s_p = 1ULL << s_bc;
static constexpr unsigned long long s_hp = s_p / 2;

// State written by fastDivInitialize() and read by fastDiv():
//   s_f  - floor(s_p / d), the truncated fixed-point reciprocal of d,
//   s_fr - s_f multiplied by the remainder of s_p / d; used to compensate
//          for the truncation of s_f.
// Both are zero until fastDivInitialize() has been called.
static unsigned long long s_f;
static unsigned long long s_fr;

// Prepares fastDiv() for divisions by d.
// d must be non-zero: the reciprocal is computed with a real division by d.
static void fastDivInitialize(const unsigned d)
{
    s_f = s_p / d;
    // s_p - s_f * d is the remainder of s_p / d.
    s_fr = s_f * (s_p - (s_f * d));
}

// Returns an approximation of n / d, where d is the divisor most recently
// passed to fastDivInitialize(), using two 64-bit multiplications and shifts
// instead of a division.
// The result is only exact while n and d stay below certain thresholds;
// verify it against n / d for the range of inputs you intend to use.
static unsigned fastDiv(const unsigned n)
{
    // Main term plus the rounded correction term, scaled back down by s_p.
    return static_cast<unsigned>((s_f * n + ((s_fr * n + s_hp) >> s_bc)) >> s_bc);
}
// Checks fastDiv(n) against the regular division n / d.
// Returns true when both agree; otherwise prints a diagnostic line with the
// operands, the value fastDiv() produced and the expected quotient, and
// returns false.
// d must be non-zero, and fastDiv() must already be set up for this divisor
// (fastDiv() only takes n; the divisor comes from fastDivInitialize()).
static bool fastDivCheck(const unsigned n, const unsigned d)
{
    // 32 to 64 cycles latency on modern cpus
    const unsigned expected = n / d;

    // At least 10 cycles latency on modern cpus
    const unsigned result = fastDiv(n);

    if (result != expected)
    {
        // Report the mismatching value as well, not only the expected one.
        printf("Failed for: %u/%u: got %u, expected %u\n", n, d, result, expected);
        return false;
    }

    return true;
}
int main()
unsigned result = 0;
// Make sure to verify it works for your expected set of inputs
const unsigned MAX_N = 65535;
const unsigned MAX_D = 40000;
const double ONE_SECOND_COUNT = 1000000000.0;
auto t0 = std::chrono::steady_clock::now();
unsigned count = 0;
for (unsigned d = 1; d <= MAX_D; ++d)
for (unsigned n = 0; n <= MAX_N; ++n)
count += !fastDivCheck(n, d);
auto t1 = std::chrono::steady_clock::now();
printf("Errors: %u / %u (%.4fs)\n", count, MAX_D * (MAX_N + 1), (t1 - t0).count() / ONE_SECOND_COUNT);
t0 = t1;
for (unsigned d = 1; d <= MAX_D; ++d)
for (unsigned n = 0; n <= MAX_N; ++n)
result += fastDiv(n);
t1 = std::chrono::steady_clock::now();
printf("Fast division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);
t0 = t1;
count = 0;
for (unsigned d = 1; d <= MAX_D; ++d)
for (unsigned n = 0; n <= MAX_N; ++n)
result += n / d;
t1 = std::chrono::steady_clock::now();
printf("Normal division time: %.4fs\n", (t1 - t0).count() / ONE_SECOND_COUNT);
return result;