Python Indexing Practice Problems & Exercises
Practice: Indexing and Query Optimization
← Back to lessonCreate an index on a city column and verify its existence in the schema catalog.
Solution:
def create_city_index(conn):
conn.execute("CREATE INDEX idx_customers_city ON customers(city)")
conn.commit()
def list_indexes(conn, table):
cur = conn.execute("""
SELECT name FROM sqlite_master
WHERE type = 'index'
AND tbl_name = ?
AND name NOT LIKE 'sqlite_autoindex%'
ORDER BY name
""", (table,))
return [row[0] for row in cur.fetchall()]
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE customers (
id INTEGER PRIMARY KEY,
email TEXT UNIQUE,
name TEXT,
city TEXT
)
""")
# Insert 10,000 rows
import random, string
random.seed(42)
rows = [
(i, f'user{i}@example.com', f'User {i}',
random.choice(['NYC', 'LA', 'Chicago', 'Houston', 'Phoenix']))
for i in range(10000)
]
conn.executemany("INSERT INTO customers VALUES (?,?,?,?)", rows)
conn.commit()
# TODO: Create an index on the city column.
# Then verify the index exists using sqlite_master.
# Return a list of index names on the customers table.
def create_city_index(conn):
pass
def list_indexes(conn, table):
pass
create_city_index(conn)
print(list_indexes(conn, 'customers'))
Expected Output
['idx_customers_city']Hints
Hint 1: CREATE INDEX idx_customers_city ON customers(city) creates the index.
Hint 2: SELECT name FROM sqlite_master WHERE type='index' AND tbl_name=? lists indexes.
Hint 3: Exclude auto-created indexes (those that start with 'sqlite_autoindex') if needed.
Use EXPLAIN QUERY PLAN to observe the difference between a full table scan and an index seek.
Solution:
def check_query_plans(conn):
def get_plan(conn):
rows = conn.execute(
"EXPLAIN QUERY PLAN SELECT * FROM orders WHERE customer_id = 42"
).fetchall()
return ' '.join(str(col) for row in rows for col in row).upper()
before = get_plan(conn)
conn.execute("CREATE INDEX idx_orders_customer ON orders(customer_id)")
conn.commit()
after = get_plan(conn)
return before, after
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, customer_id INTEGER, total REAL)")
# Insert sample data
conn.executemany("INSERT INTO orders VALUES (?,?,?)",
[(i, i % 100, float(i * 10)) for i in range(5000)])
conn.commit()
# TODO:
# 1. Run EXPLAIN QUERY PLAN on a query filtering by customer_id WITHOUT an index.
# Check if the plan says 'SCAN' (full table scan).
# 2. Create an index on customer_id.
# 3. Run EXPLAIN QUERY PLAN again.
# Check if the plan now says 'SEARCH' (index scan).
# Return (plan_before, plan_after) as strings.
def check_query_plans(conn):
pass
before, after = check_query_plans(conn)
print('Before index:', 'SCAN' in before.upper())
print('After index:', 'SEARCH' in after.upper() or 'INDEX' in after.upper())
Expected Output
Before index: True
After index: TrueHints
Hint 1: EXPLAIN QUERY PLAN SELECT ... returns rows describing the query plan.
Hint 2: Join all detail columns into a string to check for keywords like SCAN or SEARCH.
Hint 3: SQLite uses 'SCAN' for full table scans and 'SEARCH' for index seeks.
Create a unique index on an email column and verify it rejects duplicate values.
Solution:
def test_unique_email_index(conn):
conn.execute("CREATE UNIQUE INDEX idx_users_email ON users(email)")
conn.commit()
try:
conn.commit()
return False # Should not reach here
except sqlite3.IntegrityError:
return True
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE users (
id INTEGER PRIMARY KEY,
username TEXT,
email TEXT
)
""")
conn.commit()
# TODO: Create a unique index on the email column.
# Insert two users with different emails (should succeed).
# Try inserting a duplicate email (should raise IntegrityError).
# Return True if the duplicate was correctly rejected.
def test_unique_email_index(conn):
pass
print(test_unique_email_index(conn))
Expected Output
TrueHints
Hint 1: CREATE UNIQUE INDEX idx_users_email ON users(email) enforces uniqueness.
Hint 2: sqlite3.IntegrityError is raised when a UNIQUE constraint is violated.
Hint 3: Catch the exception to verify the constraint is working.
Drop a specific index and verify the remaining index set with sqlite_master queries.
Solution:
def manage_indexes(conn):
def get_indexes():
cur = conn.execute("""
SELECT name FROM sqlite_master
WHERE type = 'index' AND tbl_name = 'products'
AND name NOT LIKE 'sqlite_autoindex%'
""")
return [row[0] for row in cur.fetchall()]
before = get_indexes()
conn.execute("DROP INDEX idx_price")
conn.commit()
after = get_indexes()
return before, after
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE products (id INTEGER PRIMARY KEY, sku TEXT, price REAL)")
conn.execute("CREATE INDEX idx_sku ON products(sku)")
conn.execute("CREATE INDEX idx_price ON products(price)")
conn.executemany("INSERT INTO products VALUES (?,?,?)",
[(i, f'SKU-{i:04d}', float(i * 2)) for i in range(1000)])
conn.commit()
# TODO:
# 1. List current indexes (excluding auto-indexes)
# 2. Drop idx_price
# 3. List indexes again
# Return (before_list, after_list)
def manage_indexes(conn):
pass
before, after = manage_indexes(conn)
print('Before:', sorted(before))
print('After:', sorted(after))
Expected Output
Before: ['idx_price', 'idx_sku']
After: ['idx_sku']Hints
Hint 1: DROP INDEX idx_price removes the index without affecting table data.
Hint 2: Query sqlite_master WHERE type='index' AND tbl_name='products' to list indexes.
Hint 3: REINDEX products rebuilds all indexes on the table (useful after bulk loads).
Design a composite index with the correct column order to support an equality+range query efficiently.
Solution:
def create_optimal_composite_index(conn):
conn.execute("CREATE INDEX idx_logs_composite ON logs(service, level, ts)")
conn.commit()
def verify_plan(conn):
rows = conn.execute("""
EXPLAIN QUERY PLAN
SELECT * FROM logs
WHERE service = 'api' AND level = 'ERROR' AND ts > 1700040000
""").fetchall()
return ' '.join(str(col) for row in rows for col in row)
import sqlite3
import time
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE logs (
id INTEGER PRIMARY KEY,
service TEXT,
level TEXT,
ts INTEGER,
message TEXT
)
""")
import random
random.seed(1)
services = ['api', 'worker', 'scheduler']
levels = ['DEBUG', 'INFO', 'WARN', 'ERROR']
conn.executemany("INSERT INTO logs VALUES (?,?,?,?,?)", [
(i, random.choice(services), random.choice(levels), 1700000000 + i, f'msg {i}')
for i in range(50000)
])
conn.commit()
# TODO: Create a composite index that makes this query fast:
# SELECT * FROM logs WHERE service = ? AND level = ? AND ts > ?
# The correct column order matters for the leftmost-prefix rule.
# Return the EXPLAIN QUERY PLAN output to verify index use.
def create_optimal_composite_index(conn):
pass
def verify_plan(conn):
pass
create_optimal_composite_index(conn)
plan = verify_plan(conn)
print('Uses index:', 'SCAN TABLE' not in plan.upper() or 'USING INDEX' in plan.upper())
Expected Output
Uses index: TrueHints
Hint 1: Put equality columns first in the composite index: (service, level, ts).
Hint 2: The leftmost prefix rule means the index helps WHERE service=? AND level=? AND ts>?.
Hint 3: EXPLAIN QUERY PLAN should show SEARCH ... USING INDEX, not SCAN TABLE.
Create a covering index that allows the query engine to satisfy a SELECT entirely from the index.
Solution:
def create_covering_index(conn):
conn.execute("""
CREATE INDEX idx_orders_covering
ON orders(customer_id, status, total, created_at)
""")
conn.commit()
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE orders (
id INTEGER PRIMARY KEY,
customer_id INTEGER,
status TEXT,
total REAL,
created_at INTEGER
)
""")
import random
random.seed(2)
conn.executemany("INSERT INTO orders VALUES (?,?,?,?,?)", [
(i, random.randint(1, 1000),
random.choice(['pending', 'shipped', 'delivered', 'cancelled']),
random.uniform(10, 500),
1700000000 + i)
for i in range(100000)
])
conn.commit()
# The query: SELECT total, created_at FROM orders WHERE customer_id = ? AND status = 'shipped'
# TODO: Create a covering index that includes all needed columns so
# SQLite can satisfy the query entirely from the index (no table lookup).
# Verify with EXPLAIN QUERY PLAN.
def create_covering_index(conn):
pass
create_covering_index(conn)
plan_rows = conn.execute("""
EXPLAIN QUERY PLAN
SELECT total, created_at FROM orders
WHERE customer_id = 42 AND status = 'shipped'
""").fetchall()
plan = ' '.join(str(c) for row in plan_rows for c in row)
print('Covering index active:', 'COVERING INDEX' in plan.upper() or 'USING INDEX' in plan.upper())
Expected Output
Covering index active: TrueHints
Hint 1: A covering index includes all columns referenced in WHERE, SELECT, and ORDER BY.
Hint 2: CREATE INDEX idx ON orders(customer_id, status, total, created_at) covers the whole query.
Hint 3: SQLite shows 'USING COVERING INDEX' in the query plan when no table row fetch is needed.
Create a partial (filtered) index that only indexes active tasks, saving storage and improving query speed.
Solution:
def create_partial_index(conn):
conn.execute("""
CREATE INDEX idx_active_tasks_priority
ON tasks(priority)
WHERE status IN ('pending', 'in_progress')
""")
conn.commit()
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE tasks (
id INTEGER PRIMARY KEY,
title TEXT,
status TEXT, -- 'pending', 'in_progress', 'done'
priority INTEGER,
assigned_to INTEGER
)
""")
import random
random.seed(3)
statuses = ['pending'] * 1 + ['in_progress'] * 1 + ['done'] * 8 # 80% done
conn.executemany("INSERT INTO tasks VALUES (?,?,?,?,?)", [
(i, f'Task {i}', random.choice(statuses), random.randint(1, 5), random.randint(1, 50))
for i in range(100000)
])
conn.commit()
# Most queries filter active tasks: WHERE status IN ('pending', 'in_progress')
# Only ~20% of rows are active — a partial index saves space and speeds queries.
# TODO: Create a partial index only on active tasks.
# Then verify it's used when filtering for active tasks.
def create_partial_index(conn):
pass
create_partial_index(conn)
plan_rows = conn.execute("""
EXPLAIN QUERY PLAN
SELECT * FROM tasks
WHERE status = 'pending' AND priority >= 3
""").fetchall()
plan = ' '.join(str(c) for row in plan_rows for c in row)
print('Partial index used:', 'INDEX' in plan.upper())
Expected Output
Partial index used: TrueHints
Hint 1: CREATE INDEX ... ON tasks(priority) WHERE status IN ('pending', 'in_progress') is a partial index.
Hint 2: The WHERE clause on the index means only matching rows are indexed.
Hint 3: SQLite will use this index when the query's WHERE clause matches the index predicate.
Empirically measure the query time speedup from adding an index on a large table.
Solution:
import time
def benchmark_index(n_rows=100000):
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE events (id INTEGER PRIMARY KEY, user_id INTEGER, action TEXT)")
import random
random.seed(42)
conn.executemany("INSERT INTO events VALUES (?,?,?)", [
(i, random.randint(1, 10000), random.choice(['click', 'view', 'buy']))
for i in range(n_rows)
])
conn.commit()
def time_query(conn, iterations=5):
times = []
for _ in range(iterations):
t0 = time.perf_counter()
conn.execute(
"SELECT COUNT(*) FROM events WHERE user_id = 42 AND action = 'buy'"
).fetchone()
times.append(time.perf_counter() - t0)
return min(times) * 1000
without_ms = time_query(conn)
conn.execute("CREATE INDEX idx_events_user ON events(user_id, action)")
conn.commit()
with_ms = time_query(conn)
conn.close()
return without_ms, with_ms
import sqlite3
import time
# TODO: Create a table with 100,000 rows.
# Measure query time WITHOUT an index.
# Add an index.
# Measure query time WITH the index.
# Return (time_without_index, time_with_index) in milliseconds.
# The indexed query should be significantly faster.
def benchmark_index(n_rows=100000):
pass
without_ms, with_ms = benchmark_index()
print(f'Without index: {without_ms:.2f}ms')
print(f'With index: {with_ms:.2f}ms')
print(f'Speedup: {without_ms / with_ms:.1f}x faster')
Expected Output
Without index: X.XXms
With index: X.XXms
Speedup: X.Xx fasterHints
Hint 1: Use time.perf_counter() before and after the query for high-resolution timing.
Hint 2: Run the same SELECT WHERE query with and without the index.
Hint 3: Run each measurement multiple times and take the minimum to reduce noise.
Rewrite queries that apply functions to indexed columns, restoring index usability.
Solution:
def find_user_by_email(conn, email):
# Use COLLATE NOCASE — SQLite can still use a NOCASE index, or store as lower
cur = conn.execute(
"SELECT * FROM users WHERE email = ? COLLATE NOCASE",
(email,)
)
return cur.fetchall()
def find_january_signups(conn):
cur = conn.execute("""
SELECT * FROM users
WHERE signup_date >= '2024-01-01' AND signup_date < '2024-02-01'
ORDER BY signup_date
""")
return cur.fetchall()
import sqlite3
conn = sqlite3.connect(':memory:')
conn.execute("""
CREATE TABLE users (
id INTEGER PRIMARY KEY,
email TEXT,
signup_date TEXT -- stored as 'YYYY-MM-DD'
)
""")
conn.execute("CREATE INDEX idx_email ON users(email)")
conn.execute("CREATE INDEX idx_signup ON users(signup_date)")
conn.executemany("INSERT INTO users VALUES (?,?,?)", [
(1, '[email protected]', '2024-01-15'),
(2, '[email protected]', '2024-02-20'),
(3, '[email protected]', '2024-03-10'),
(4, '[email protected]', '2024-01-05'),
])
conn.commit()
# The following queries are "index killers" — they prevent index use.
# Bad: WHERE LOWER(email) = ? (function on indexed column)
# Bad: WHERE strftime('%Y', signup_date) = '2024' (function on indexed column)
# TODO: Rewrite both queries to use the index:
# 1. Find user with email '[email protected]' (case-insensitive, index-friendly)
# 2. Find users who signed up in January 2024 (range query, index-friendly)
def find_user_by_email(conn, email):
pass
def find_january_signups(conn):
pass
print(find_user_by_email(conn, '[email protected]'))
print(find_january_signups(conn))
Expected Output
[(1, '[email protected]', '2024-01-15')]
[(4, '[email protected]', '2024-01-05'), (1, '[email protected]', '2024-01-15')]Hints
Hint 1: Store emails in lowercase at insert time, so WHERE email = lower(?) can use the index.
Hint 2: For date ranges: WHERE signup_date >= '2024-01-01' AND signup_date < '2024-02-01'.
Hint 3: Using functions on the indexed column (LOWER(email)) prevents the index from being used.
Build a query plan analyzer that classifies each step as an index seek or a full table scan.
Solution:
def analyze_query(conn, sql, params=()):
plan_rows = conn.execute(f"EXPLAIN QUERY PLAN {sql}", params).fetchall()
details = [str(row[-1]) for row in plan_rows]
scan_count = sum(1 for d in details if 'SCAN' in d.upper() and 'INDEX' not in d.upper())
index_count = sum(1 for d in details if 'SEARCH' in d.upper() or 'USING INDEX' in d.upper() or 'COVERING INDEX' in d.upper())
return {
'uses_index': index_count > 0,
'scan_count': scan_count,
'index_count': index_count,
'plan_details': details,
}
import sqlite3
# TODO: Build a query analyzer that:
# 1. Runs EXPLAIN QUERY PLAN on a given SQL query
# 2. Reports whether each step uses an index or a full scan
# 3. Returns a dict with:
# - 'uses_index': bool
# - 'scan_count': number of full table scans
# - 'index_count': number of index seeks
# - 'plan_details': list of detail strings
def analyze_query(conn, sql, params=()):
pass
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE t (id INTEGER PRIMARY KEY, x INTEGER, y TEXT)")
conn.executemany("INSERT INTO t VALUES (?,?,?)", [(i, i*2, f'val{i}') for i in range(10000)])
conn.commit()
# Without index
result1 = analyze_query(conn, "SELECT * FROM t WHERE x = 100")
print('No index - uses_index:', result1['uses_index'])
# With index
conn.execute("CREATE INDEX idx_x ON t(x)")
result2 = analyze_query(conn, "SELECT * FROM t WHERE x = 100")
print('With index - uses_index:', result2['uses_index'])
Expected Output
No index - uses_index: False
With index - uses_index: TrueHints
Hint 1: EXPLAIN QUERY PLAN returns rows with columns: id, parent, notused, detail.
Hint 2: The 'detail' column contains strings like 'SCAN TABLE t' or 'SEARCH t USING INDEX'.
Hint 3: Check each detail string for keywords 'SCAN' vs 'SEARCH' or 'INDEX'.
Build an index advisor that analyzes a query workload and recommends CREATE INDEX statements for unindexed hot columns.
Solution:
import re
from collections import Counter
def index_advisor(conn, queries, top_n=3):
# Extract column names from WHERE clauses
col_pattern = re.compile(r'\bWHERE\b.*?\b(\w+)\s*[=><]', re.IGNORECASE)
and_pattern = re.compile(r'\bAND\b\s+(\w+)\s*[=><]', re.IGNORECASE)
col_counts = Counter()
for q in queries:
for m in col_pattern.finditer(q):
col_counts[m.group(1).lower()] += 1
for m in and_pattern.finditer(q):
col_counts[m.group(1).lower()] += 1
# Get already-indexed columns
indexed_cols = set()
rows = conn.execute("""
SELECT sql FROM sqlite_master
WHERE type = 'index' AND tbl_name = 'orders'
AND sql IS NOT NULL
""").fetchall()
for (sql,) in rows:
m = re.search(r'\((\w+)', sql)
if m:
indexed_cols.add(m.group(1).lower())
# Skip primary key and already indexed
skip = indexed_cols | {'id'}
recommendations = []
for col, _ in col_counts.most_common():
if col in skip:
continue
recommendations.append(f"CREATE INDEX idx_orders_{col} ON orders({col});")
if len(recommendations) >= top_n:
break
return recommendations
import sqlite3
import re
from collections import Counter
# TODO: Build a basic index advisor that:
# 1. Analyzes a list of SQL queries (looking for WHERE column = ?)
# 2. Counts how often each column appears in WHERE clauses
# 3. Checks which columns already have indexes
# 4. Recommends CREATE INDEX statements for the top N unindexed columns
def index_advisor(conn, queries, top_n=3):
pass
conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE orders (id INTEGER PRIMARY KEY, customer_id INTEGER, status TEXT, region TEXT, amount REAL)")
conn.execute("CREATE INDEX idx_customer ON orders(customer_id)")
conn.commit()
workload = [
"SELECT * FROM orders WHERE customer_id = ?",
"SELECT * FROM orders WHERE status = ? AND customer_id = ?",
"SELECT * FROM orders WHERE region = ?",
"SELECT * FROM orders WHERE status = ?",
"SELECT * FROM orders WHERE region = ? AND status = ?",
"SELECT * FROM orders WHERE amount > ?",
"SELECT * FROM orders WHERE status = ?",
]
recommendations = index_advisor(conn, workload, top_n=3)
for rec in recommendations:
print(rec)
Expected Output
CREATE INDEX idx_orders_status ON orders(status);
CREATE INDEX idx_orders_region ON orders(region);
CREATE INDEX idx_orders_amount ON orders(amount);Hints
Hint 1: Use a regex like r'WHERE\s+(\w+)\s*=' to extract column names from queries.
Hint 2: Count column frequencies with collections.Counter.
Hint 3: Check existing indexes via sqlite_master, then recommend for unindexed high-frequency columns.
