Python Vectorization with NumPy Practice Problems & Exercises
Practice: Vectorization with NumPy
← Back to lesson
Write sum_range(n) that returns the sum of all integers from 0 to n-1 (inclusive).
Constraint: do not use any Python for or while loop.
Example
sum_range(1_000_000) # → 499999500000
Solution
import numpy as np
def sum_range(n: int) -> int:
    """Return the sum of the integers 0, 1, ..., n - 1 without a Python loop.

    Args:
        n: Exclusive upper bound. Values <= 0 produce an empty range, so the
            result is 0.

    Returns:
        The sum as a built-in Python int.
    """
    # Pin the dtype to int64: where NumPy's default integer is 32 bits wide
    # (e.g. Windows with NumPy < 2.0) the default accumulator would overflow
    # for large n such as 1_000_000.
    return int(np.arange(n, dtype=np.int64).sum())
np.arange creates the array in one call; .sum() is a C-level reduction — no Python loop ever executes.
import numpy as np
def sum_range(n: int) -> int:
    """Return the sum of the integers 0, 1, ..., n - 1 without a Python loop.

    Exercise stub: the body is intentionally left for the reader.

    Args:
        n: Exclusive upper bound of the range to sum.

    Returns:
        The sum as an int, e.g. 499999500000 for n = 1_000_000.
    """
    # TODO: create an array and sum it
    pass
Expected Output
499999500000
Hints
Hint 1: Use np.arange to create the array.
Hint 2: Call .sum() directly on the array — no Python loop needed.
Implement clamp(arr, lo, hi) that clips every element of arr to the range [lo, hi].
Example
clamp(np.array([-2, 0, 3, 7, 5]), 0, 5)
# → array([0, 0, 3, 5, 5])
Solution
import numpy as np
def clamp(arr: np.ndarray, lo: float, hi: float) -> np.ndarray:
    """Limit every element of ``arr`` to the closed interval [lo, hi].

    Elements below ``lo`` become ``lo`` and elements above ``hi`` become
    ``hi``; everything in between is passed through unchanged.
    """
    clipped = np.clip(arr, lo, hi)
    return clipped
np.clip is a ufunc-like operation executed entirely in C — one call, no Python overhead per element.
import numpy as np
def clamp(arr: np.ndarray, lo: float, hi: float) -> np.ndarray:
    """Return arr with every value clamped to [lo, hi].

    Exercise stub: the body is intentionally left for the reader.

    Args:
        arr: Input array.
        lo: Lower bound; smaller values are raised to it.
        hi: Upper bound; larger values are lowered to it.
    """
    pass
Expected Output
[0 0 3 5 5]
Hints
Hint 1: np.clip(array, lo, hi) applies the clamp in one pass.
Hint 2: You do not need np.where or a loop.
Write even_elements(arr) that returns all even integers from the array without a Python loop.
Example
even_elements(np.arange(1, 10))
# → array([2, 4, 6, 8])
Solution
import numpy as np
def even_elements(arr: np.ndarray) -> np.ndarray:
    """Select the elements of ``arr`` that are divisible by two, in order."""
    is_even = (arr % 2) == 0
    return arr[is_even]
arr % 2 == 0 produces a boolean array. Using it as an index is boolean-mask indexing (a form of advanced indexing) — NumPy gathers the matching elements in one C pass.
import numpy as np
def even_elements(arr: np.ndarray) -> np.ndarray:
    """Return only the even-valued elements of arr, preserving order.

    Exercise stub: the body is intentionally left for the reader.
    Example: for np.arange(1, 10) the expected result is [2 4 6 8].
    """
    pass
Expected Output
[2 4 6 8]
Hints
Hint 1: Create a boolean mask with arr % 2 == 0.
Hint 2: Index the array with that mask directly.
Implement outer_diff(a) that builds the full difference table of a 1-D array using broadcasting — no nested loops.
Example
outer_diff(np.array([0, 1, 2]))
# array([[ 0, -1, -2],
# [ 1, 0, -1],
# [ 2, 1, 0]])
Solution
import numpy as np
def outer_diff(a: np.ndarray) -> np.ndarray:
    """Build the pairwise difference table ``M[i, j] = a[i] - a[j]``."""
    # A 1-D input of length n becomes shape (n, 1), so the subtraction
    # broadcasts against the original (n,) array to give (n, n).
    as_column = a[:, None]
    return as_column - a
a[:, np.newaxis] is shape (n, 1); plain a is (n,). NumPy broadcasts to (n, n) without allocating any intermediate Python objects.
import numpy as np
def outer_diff(a: np.ndarray) -> np.ndarray:
    """Return matrix M where M[i, j] = a[i] - a[j].

    Exercise stub: the body is intentionally left for the reader.
    For a 1-D input of length n the result has shape (n, n); for example,
    a = [0, 1, 2] gives first row [0, -1, -2].
    """
    pass
Expected Output
[[ 0 -1 -2]\n [ 1 0 -1]\n [ 2 1 0]]
Hints
Hint 1: Reshape one array to (n, 1) and the other stays (n,) — subtraction broadcasts.
Hint 2: a[:, np.newaxis] - a gives an (n, n) matrix without any loop.
Write zscore_rows(X) that normalises each row of a 2-D array to zero mean and unit variance.
Example
X = np.array([[1, 2, 3], [10, 20, 30]], dtype=float)
Z = zscore_rows(X)
# each row of Z has mean ≈ 0, std ≈ 1
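Add a small epsilon (1e-8) to the denominator to avoid division by zero.
Solution
Compute the row statistics with keepdims=True and return (X - X.mean(axis=1, keepdims=True)) / (X.std(axis=1, keepdims=True) + 1e-8).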
keepdims=True preserves the (n, 1) shape, so the subtraction and division broadcast correctly across all columns.
import numpy as np
def zscore_rows(X: np.ndarray) -> np.ndarray:
    """Normalise each row of X to zero mean and unit variance.

    Exercise stub: the body is intentionally left for the reader.
    The exercise asks for a small epsilon (1e-8) to be added to the
    denominator so that constant rows do not divide by zero.

    Args:
        X: 2-D array; statistics are taken along axis 1 (per row).
    """
    pass
Expected Output
Each row has mean ≈ 0 and std ≈ 1
Hints
Hint 1: Compute mean and std with keepdims=True so shapes broadcast.
Hint 2: Subtract mean then divide by std along axis=1.
Implement a gather operation (read from arbitrary indices) and a scatter operation (write to arbitrary indices).
Example
arr = np.array([10, 20, 30, 40, 50])
idx = np.array([2, 0, 1, 2])
print(gather(arr, idx)) # [30 10 20 30]
arr2 = np.array([1, 2, 3, 4, 5])
scatter(arr2, np.array([3]), np.array([400])) # [1 2 3 400 5]
Solution
import numpy as np
def gather(arr: np.ndarray, idx: np.ndarray) -> np.ndarray:
    """Read ``arr`` at the positions listed in ``idx`` (repeats allowed)."""
    picked = arr[idx]
    return picked
def scatter(arr: np.ndarray, idx: np.ndarray, values: np.ndarray) -> np.ndarray:
    """Write ``values`` into a copy of ``arr`` at the positions in ``idx``.

    The input ``arr`` is left untouched; only the returned copy is changed.
    """
    updated = arr.copy()
    updated[idx] = values
    return updated
gather is O(k) in pure C, where k is the length of idx; scatter first copies arr, so it is O(n + k) for an array of n elements. Neither runs a Python loop.
import numpy as np
def gather(arr: np.ndarray, idx: np.ndarray) -> np.ndarray:
    """Collect arr[i] for each i in idx (with repeats allowed).

    Exercise stub: the body is intentionally left for the reader.
    Example: arr = [10 20 30 40 50], idx = [2 0 1 2] -> [30 10 20 30].
    """
    pass
def scatter(arr: np.ndarray, idx: np.ndarray, values: np.ndarray) -> np.ndarray:
    """Return a copy of arr with arr[i] = values[k] for each (i=idx[k]).

    Exercise stub: the body is intentionally left for the reader.
    Example: arr = [1 2 3 4 5], idx = [3], values = [400] -> [1 2 3 400 5].
    """
    pass
Expected Output
gathered: [30 10 20 30]\nscattered: [1 2 3 400 5]
Hints
Hint 1: Use an index array directly: arr[idx].
Hint 2: For scatter: arr[idx] = values assigns in one call.
Understand and manipulate NumPy memory layout. Implement two helpers:
inspect_layout: return a dict with shape, strides, c_contiguous, f_contiguous.
force_c_order: return a C-contiguous copy of the array.
Example
arr = np.array([[1, 2, 3], [4, 5, 6]])
info = inspect_layout(arr)
# info["c_contiguous"] == True
# info["strides"] == (24, 8) for 8-byte elements (e.g. int64 or float64)
arr_T = arr.T # now F-contiguous
c = force_c_order(arr_T)
inspect_layout(c)["c_contiguous"] # True
Solution
import numpy as np
def inspect_layout(arr: np.ndarray) -> dict:
    """Summarise the memory layout of ``arr``.

    Returns:
        A dict with the keys ``shape``, ``strides``, ``c_contiguous`` and
        ``f_contiguous``, read from the array's attributes and flags.
    """
    flags = arr.flags
    layout = {"shape": arr.shape, "strides": arr.strides}
    layout["c_contiguous"] = flags.c_contiguous
    layout["f_contiguous"] = flags.f_contiguous
    return layout
def force_c_order(arr: np.ndarray) -> np.ndarray:
    """Return a C-contiguous copy of ``arr`` regardless of its layout.

    Args:
        arr: Array in any memory layout (C-ordered, F-ordered or strided).

    Returns:
        A C-contiguous array with the same values that does not share
        memory with ``arr``.
    """
    out = np.ascontiguousarray(arr)
    # ascontiguousarray hands back the input itself when it is already
    # C-contiguous; copy in that case so callers always get independent data.
    if np.may_share_memory(out, arr):
        out = out.copy(order="C")
    return out
Row-major (C) layout means consecutive row elements are adjacent in memory — iterating along axis 1 is cache-friendly. Fortran-order is column-major. Many NumPy and BLAS routines require contiguous input.
import numpy as np
def inspect_layout(arr: np.ndarray) -> dict:
    """Return a dict with keys: shape, strides, c_contiguous, f_contiguous.

    Exercise stub: the body is intentionally left for the reader.
    The two ``*_contiguous`` entries are expected to be booleans, as in the
    example ``info["c_contiguous"] == True``.
    """
    pass
def force_c_order(arr: np.ndarray) -> np.ndarray:
    """Return a C-contiguous copy of arr regardless of input layout.

    Exercise stub: the body is intentionally left for the reader.
    Example: for an F-contiguous transpose ``arr.T`` the result must report
    ``flags.c_contiguous`` as True.
    """
    pass
Expected Output
C-contiguous: True | F-contiguous after transpose: True
Hints
Hint 1: np.ascontiguousarray forces C-order.
Hint 2: Check .flags.c_contiguous and .flags.f_contiguous.
Hint 3: np.asfortranarray forces F-order.
Implement pairwise_euclidean(X, Y) that computes the full (m, n) distance matrix between two point sets using only NumPy broadcast math — no scipy, no loops.
Example
X = np.random.randn(50, 128)
Y = np.random.randn(80, 128)
D = pairwise_euclidean(X, Y)
assert D.shape == (50, 80)
assert D.min() >= 0
Solution
import numpy as np
def pairwise_euclidean(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """Compute all Euclidean distances between rows of ``X`` and rows of ``Y``.

    Uses the expansion ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x.y, so only
    (m, n)-shaped intermediates are formed.

    Returns:
        Array of shape (m, n) whose entry [i, j] is the distance between
        ``X[i]`` and ``Y[j]``.
    """
    x_sq = (X ** 2).sum(axis=1, keepdims=True)     # (m, 1)
    y_sq = (Y ** 2).sum(axis=1, keepdims=True).T   # (1, n)
    gram = X @ Y.T                                 # (m, n)
    sq_dist = x_sq + y_sq - 2 * gram
    # Rounding can leave tiny negative values; clamp them before the sqrt.
    non_negative = np.clip(sq_dist, 0, None)
    return np.sqrt(non_negative)
This avoids materialising an (m, n, d) tensor. The dominant cost is the (m, d) @ (d, n) BLAS GEMM — O(mnd) but at peak hardware throughput.
import numpy as np
def pairwise_euclidean(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """
    Compute pairwise Euclidean distances between rows of X (m, d)
    and rows of Y (n, d). Return matrix of shape (m, n).
    No Python loops.

    Exercise stub: the body is intentionally left for the reader.
    Entry [i, j] of the result is the distance between X[i] and Y[j];
    all entries are expected to be >= 0.
    """
    pass
Expected Output
shape (m, n), matches scipy cdist
Hints
Hint 1: Expand ||x - y||^2 = ||x||^2 + ||y||^2 - 2 x·y; the cross term for all pairs is X @ Y.T, and the squared norms broadcast against it.
Hint 2: np.sum(X**2, axis=1, keepdims=True) gives (m, 1) row norms.
Hint 3: Clip negative values before sqrt to handle floating-point noise.
Implement batched_dot(A, B) using np.einsum. It must compute the dot product of each pair of rows (A[k], B[k]) and return a 1-D array of length batch.
Example
A = np.random.randn(32, 64)
B = np.random.randn(32, 64)
result = batched_dot(A, B)
assert result.shape == (32,)
# verify against loop
expected = np.array([A[i] @ B[i] for i in range(32)])
np.testing.assert_allclose(result, expected, rtol=1e-5)
Solution
import numpy as np
def batched_dot(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Return the row-wise dot products ``result[k] = dot(A[k], B[k])``.

    Both inputs have shape (batch, d); the output has shape (batch,).
    """
    # "b" labels the batch axis kept in the output; "i" is summed away.
    row_dots = np.einsum("bi,bi->b", A, B)
    return row_dots
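einsum subscript explanation: b is the batch axis (kept in output), i is the inner dimension (contracted/summed). NumPy evaluates this in a single compiled loop without materialising the (batch, d) product array; with optimize=True, einsum can additionally dispatch suitable contractions to BLAS.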
Alternatives (for comparison):
# multiply then sum — also correct, slightly different memory usage
(A * B).sum(axis=1)
import numpy as np
def batched_dot(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """
    A: shape (batch, d)
    B: shape (batch, d)
    Return: shape (batch,) where result[k] = dot(A[k], B[k])
    Use np.einsum — no loops.

    Exercise stub: the body is intentionally left for the reader.
    """
    pass
Expected Output
shape (batch,), matches loop implementation
Hints
Hint 1: einsum("bi,bi->b", A, B) performs a batched dot product.
Hint 2: The subscript "bi,bi->b" means: for each batch item b, sum over i.
Implement a zero-copy sliding mean using NumPy stride tricks. The function must not allocate a new array for the window views — only the final mean output.
Example
arr = np.array([1.0, 2, 3, 4, 5, 6])
sliding_mean(arr, 3)
# array([2., 3., 4., 5.])
Solution
import numpy as np
def sliding_mean(arr: np.ndarray, window: int) -> np.ndarray:
    """Compute the rolling mean of a 1-D array over ``window`` elements.

    Returns:
        Array of shape (len(arr) - window + 1,), one mean per window.
    """
    make_windows = np.lib.stride_tricks.sliding_window_view
    # One row per window position, taken as a view onto arr.
    windows = make_windows(arr, window)
    return windows.mean(axis=1)
sliding_window_view returns a view — no data is copied. The resulting array shape is (n - w + 1, w). Calling .mean(axis=1) reduces each window to a scalar in a single C pass.
Why this matters: time-series rolling statistics on millions of rows become single-line operations with no Python loop overhead.
import numpy as np
def sliding_mean(arr: np.ndarray, window: int) -> np.ndarray:
    """
    Compute a 1-D sliding (rolling) mean with the given window size.
    Return array of shape (len(arr) - window + 1,).
    No Python loop — use stride tricks.

    Exercise stub: the body is intentionally left for the reader.
    Example: arr = [1, 2, 3, 4, 5, 6], window = 3 -> [2., 3., 4., 5.].
    """
    pass
Expected Output
shape (n-w+1,), matches loop-based mean
Hints
Hint 1: np.lib.stride_tricks.sliding_window_view creates an (n-w+1, w) view.
Hint 2: Call .mean(axis=1) on that view — zero extra allocations.
Hint 3: The view shares memory with the original array.
Vectorize score_loop by implementing score_vectorized that produces identical results but uses no Python loop.
Then write a brief benchmark comparing both on an array of 1 million elements.
Expected behaviour
x = np.random.randn(1_000_000)
np.testing.assert_allclose(
score_loop(x, 0.5),
score_vectorized(x, 0.5),
rtol=1e-10
)
# score_vectorized should be 10x-100x faster
Solution
import numpy as np
import timeit
def score_vectorized(x: np.ndarray, threshold: float) -> np.ndarray:
    """Score every element of ``x`` without a Python loop.

    Elements strictly above ``threshold`` map to ``x * 2.0 - threshold``;
    all other elements map to ``x * 0.5``.
    """
    above = x > threshold
    boosted = x * 2.0 - threshold
    damped = x * 0.5
    # Both branches are computed for the whole array; the mask picks per element.
    return np.where(above, boosted, damped)
# Quick benchmark
if __name__ == "__main__":
    # Time both implementations on the same 1M-element input and report the
    # average time per call; the slow loop gets fewer repetitions.
    loop_runs, vec_runs = 3, 10
    x = np.random.randn(1_000_000)
    t_loop = timeit.timeit(lambda: score_loop(x, 0.5), number=loop_runs) / loop_runs
    t_vec = timeit.timeit(lambda: score_vectorized(x, 0.5), number=vec_runs) / vec_runs
    print(f"Loop: {t_loop*1000:.1f} ms")
    print(f"Vectorized: {t_vec*1000:.2f} ms")
    print(f"Speedup: {t_loop/t_vec:.0f}x")
np.where(condition, a, b) is a ternary select executed in C. On a 1 M-element array you will typically see 50x–200x speedup because NumPy avoids Python interpreter overhead on every iteration and exploits SIMD instructions.
import numpy as np
def score_loop(x: np.ndarray, threshold: float) -> np.ndarray:
    """Reference implementation — Python loop, do not modify.

    Scores each element of ``x``: a value strictly above ``threshold`` maps
    to ``v * 2.0 - threshold``; any other value maps to ``v * 0.5``.
    """
    # Float output buffer shaped like x; assumes x is 1-D so that every slot
    # is written by the loop below.
    result = np.empty_like(x, dtype=float)
    # Deliberately element-by-element: this is the slow baseline that the
    # vectorized version is compared against.
    for i in range(len(x)):
        v = x[i]
        if v > threshold:
            result[i] = v * 2.0 - threshold
        else:
            result[i] = v * 0.5
    return result
def score_vectorized(x: np.ndarray, threshold: float) -> np.ndarray:
    """
    Vectorized equivalent of score_loop — no Python loop.
    Use np.where (or boolean masking) to handle the branch.

    Exercise stub: the body is intentionally left for the reader.
    The result must match score_loop(x, threshold) element for element.
    """
    pass
Expected Output
vectorized result matches loop result; 10x+ speedup
Hints
Hint 1: Express the loop body as element-wise array operations.
Hint 2: np.where handles conditional logic without branching.
Hint 3: Profile with timeit to confirm the speedup.
