Fastest way I found is via inner1d. Here's how it compares to other numpy methods:
import numpy as np
from numpy.core.umath_tests import inner1d
V = np.random.random_sample((10**6,3,)) # 1 million vectors
A = np.sqrt(np.einsum('...i,...i', V, V))
B = np.linalg.norm(V,axis=1)
C = np.sqrt((V ** 2).sum(-1))
D = np.sqrt((V*V).sum(axis=1))
E = np.sqrt(inner1d(V,V))
print [np.allclose(E,x) for x in [A,B,C,D]] # [True, True, True, True]
import cProfile
cProfile.run("np.sqrt(np.einsum('...i,...i', V, V))") # 3 function calls in 0.013 seconds
cProfile.run('np.linalg.norm(V,axis=1)') # 9 function calls in 0.029 seconds
cProfile.run('np.sqrt((V ** 2).sum(-1))') # 5 function calls in 0.028 seconds
cProfile.run('np.sqrt((V*V).sum(axis=1))') # 5 function calls in 0.027 seconds
cProfile.run('np.sqrt(inner1d(V,V))') # 2 function calls in 0.009 seconds
inner1d is ~3x faster than linalg.norm and a hair faster than einsum