Python Reading Files Practice Problems & Exercises
Practice: Reading Files
← Back to lesson
Easy
Write a function read_entire_file(path) that opens a text file using a with statement and UTF-8 encoding, reads its entire content, and returns it as a single string.
This tests the most fundamental file reading pattern: with open() plus f.read().
import tempfile, os
# Setup: create a temporary file
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write("Hello, Python!\nWelcome to file reading.")
tmp.close()
filepath = tmp.name
def read_entire_file(path):
    """Return the whole content of the text file at ``path`` as one string.

    The file is opened for reading with UTF-8 decoding, and the ``with``
    statement closes it again before the function returns.
    """
    with open(path, mode="r", encoding="utf-8") as f:
        content = f.read()
    return content
# Test
content = read_entire_file(filepath)
print(repr(content))
print(type(content).__name__)
os.unlink(filepath)Solution
def read_entire_file(path):
    """Return the whole content of the text file at ``path`` as one string.

    The file is opened for reading with UTF-8 decoding, and the ``with``
    statement closes it again before the function returns.
    """
    with open(path, mode="r", encoding="utf-8") as f:
        content = f.read()
    return content
Key points:
- `with open()` guarantees the file is closed even if an exception occurs.
- Always specify `encoding="utf-8"` explicitly — the default varies by platform.
- `f.read()` with no arguments reads the entire file into a single `str` object.
- This pattern is appropriate for small files. For large files, use iteration instead.
import tempfile, os
# Setup: create a temporary file
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write("Hello, Python!\nWelcome to file reading.")
tmp.close()
filepath = tmp.name
def read_entire_file(path):
    # TODO: Open the file with UTF-8 encoding using a with statement.
    # Read the entire content and return it as a string.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
content = read_entire_file(filepath)
print(repr(content))
print(type(content).__name__)
os.unlink(filepath)Expected Output
'Hello, Python!\nWelcome to file reading.'
str
Hints
Hint 1: Use 'with open(path, "r", encoding="utf-8") as f:' to open the file safely.
Hint 2: Call f.read() with no arguments to read the entire file into one string.
Write a function read_lines_clean(path) that reads all lines from a file and returns them as a list with trailing newlines removed.
This practices readlines() and the common pattern of stripping newline characters from each line.
import tempfile, os
# Setup
content = "apple\nbanana\ncherry\ndate\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def read_lines_clean(path):
    """Return the lines of the UTF-8 text file at ``path`` without newlines.

    All lines are loaded with ``readlines()``; only the trailing newline of
    each line is removed, so leading whitespace is preserved.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()
    return [raw.rstrip("\n") for raw in raw_lines]
# Test
lines = read_lines_clean(filepath)
print(lines)
print(len(lines))
os.unlink(filepath)Solution
def read_lines_clean(path):
    """Return the lines of the UTF-8 text file at ``path`` without newlines.

    All lines are loaded with ``readlines()``; only the trailing newline of
    each line is removed, so leading whitespace is preserved.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_lines = f.readlines()
    return [raw.rstrip("\n") for raw in raw_lines]
Key points:
- `f.readlines()` returns a list where each element is a line including its trailing `\n`.
- Use `rstrip("\n")` instead of `strip()` to only remove the trailing newline, preserving any leading whitespace.
- `list(f)` is equivalent to `f.readlines()` — both load all lines into memory.
- For large files, prefer line-by-line iteration instead of `readlines()`.
import tempfile, os
# Setup
content = "apple\nbanana\ncherry\ndate\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def read_lines_clean(path):
    # TODO: Open the file, read all lines, and return a list of lines
    # with trailing newlines stripped.
    # Example: ["apple", "banana", "cherry", "date"]
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
lines = read_lines_clean(filepath)
print(lines)
print(len(lines))
os.unlink(filepath)Expected Output
['apple', 'banana', 'cherry', 'date']
4
Hints
Hint 1: Use f.readlines() to get a list of lines, each ending with '\n'.
Hint 2: Use a list comprehension with line.rstrip('\n') to strip the trailing newlines.
Write a function count_lines_readline(path) that uses readline() in a while loop to count the total number of lines in a file. Return the count.
This tests your understanding of the critical difference between an empty line ("\n") and end-of-file ("").
import tempfile, os
# Setup
content = "first\nsecond\nthird\nfourth\nfifth\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def count_lines_readline(path):
    """Count the lines of the UTF-8 text file at ``path`` using ``readline()``.

    ``readline()`` returns an empty string only at end-of-file; a blank line
    is returned as a newline character, which is truthy and therefore counted.
    """
    line_count = 0
    with open(path, "r", encoding="utf-8") as f:
        while True:
            # An empty string (falsy) is the EOF signal.
            if not f.readline():
                break
            line_count += 1
    return line_count
# Test
count = count_lines_readline(filepath)
print(count)
os.unlink(filepath)Solution
def count_lines_readline(path):
    """Count the lines of the UTF-8 text file at ``path`` using ``readline()``.

    ``readline()`` returns an empty string only at end-of-file; a blank line
    is returned as a newline character, which is truthy and therefore counted.
    """
    line_count = 0
    with open(path, "r", encoding="utf-8") as f:
        while True:
            # An empty string (falsy) is the EOF signal.
            if not f.readline():
                break
            line_count += 1
    return line_count
Key points:
- `readline()` returns `""` (empty string) at EOF — this is falsy, so `if not line` catches it.
- An empty line in the file returns `"\n"` — this is truthy, so it is correctly counted.
- This pattern is useful when you need to mix reading with other file operations (like `tell()`).
- For simple line counting, `for line in f` with a counter or `sum(1 for _ in f)` is more idiomatic.
import tempfile, os
# Setup
content = "first\nsecond\nthird\nfourth\nfifth\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def count_lines_readline(path):
    # TODO: Use readline() in a while loop to count
    # the number of lines in the file.
    # Return the count as an integer.
    # Remember: readline() returns "" (empty string) at EOF,
    # but "\n" for an empty line.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
count = count_lines_readline(filepath)
print(count)
os.unlink(filepath)Expected Output
5
Hints
Hint 1: Call f.readline() in a while loop. An empty string '' signals EOF.
Hint 2: A blank line in the file returns '\n' (not ''), so it should still be counted.
Write a function read_first_n(path, n) that opens a file and reads only the first n characters, returning them as a string.
This tests the read(size) parameter and understanding that in text mode, the argument is a character count, not a byte count.
import tempfile, os
# Setup
content = "Python is a powerful programming language."
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def read_first_n(path, n):
    """Return at most the first ``n`` characters of the text file at ``path``.

    The file is decoded as UTF-8 in text mode, so ``n`` counts characters
    rather than bytes. A file shorter than ``n`` characters is returned whole.
    """
    with open(path, "r", encoding="utf-8") as f:
        prefix = f.read(n)
    return prefix
# Test
print(read_first_n(filepath, 6))
print(read_first_n(filepath, 15))
print(len(read_first_n(filepath, 10)))
os.unlink(filepath)Solution
def read_first_n(path, n):
    """Return at most the first ``n`` characters of the text file at ``path``.

    The file is decoded as UTF-8 in text mode, so ``n`` counts characters
    rather than bytes. A file shorter than ``n`` characters is returned whole.
    """
    with open(path, "r", encoding="utf-8") as f:
        prefix = f.read(n)
    return prefix
Key points:
- `f.read(n)` in text mode reads up to `n` characters (Unicode code points), not bytes.
- In binary mode (`"rb"`), `f.read(n)` reads up to `n` bytes instead.
- If the file has fewer than `n` characters, `read(n)` returns whatever is available without error.
- After `read(n)`, the file position advances by `n` characters — subsequent reads continue from there.
import tempfile, os
# Setup
content = "Python is a powerful programming language."
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
def read_first_n(path, n):
    # TODO: Open the file and read only the first n characters.
    # Return the result as a string.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
print(read_first_n(filepath, 6))
print(read_first_n(filepath, 15))
print(len(read_first_n(filepath, 10)))
os.unlink(filepath)Expected Output
Python
Python is a pow
10
Hints
Hint 1: Pass the number n to f.read(n) to read up to n characters (fewer if the file is shorter).
Hint 2: In text mode, read(n) reads n characters (not bytes).
Medium
Write a function find_lines(path, keyword) that reads a file line by line using iteration and returns a list of all lines containing the keyword (case-insensitive). Strip trailing newlines from the returned lines.
This tests the idiomatic for line in f pattern — the most memory-efficient way to process lines.
import tempfile, os
# Setup: a simulated log file
log_content = """[INFO] 2024-01-15 10:00:01 - Server started
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[INFO] 2024-01-15 10:00:06 - Retrying connection
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[WARNING] 2024-01-15 10:00:12 - High memory usage
[INFO] 2024-01-15 10:00:15 - Connection restored
[ERROR] 2024-01-15 10:00:20 - Disk space low
""".lstrip()
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write(log_content)
tmp.close()
filepath = tmp.name
def find_lines(path, keyword):
    """Return the lines of the file at ``path`` that contain ``keyword``.

    Matching is case-insensitive. The file is decoded as UTF-8 and iterated
    line by line; each matching line is returned without its trailing newline.
    """
    # Lowercase the keyword once, outside the loop.
    keyword_lower = keyword.lower()
    with open(path, "r", encoding="utf-8") as f:
        return [
            line.rstrip("\n")
            for line in f
            if keyword_lower in line.lower()
        ]
# Test
errors = find_lines(filepath, "ERROR")
for line in errors:
print(line)
print("---")
print(len(find_lines(filepath, "info")))
os.unlink(filepath)Solution
def find_lines(path, keyword):
    """Return the lines of the file at ``path`` that contain ``keyword``.

    Matching is case-insensitive. The file is decoded as UTF-8 and iterated
    line by line; each matching line is returned without its trailing newline.
    """
    # Lowercase the keyword once, outside the loop.
    keyword_lower = keyword.lower()
    with open(path, "r", encoding="utf-8") as f:
        return [
            line.rstrip("\n")
            for line in f
            if keyword_lower in line.lower()
        ]
Key points:
- `for line in f` iterates using the file's internal buffer — only one line is in memory at a time.
- Lowercase the keyword once (`keyword_lower`, outside the loop) to avoid redundant `.lower()` calls on the keyword.
- `rstrip("\n")` removes only the trailing newline, preserving any meaningful whitespace.
- This pattern works on files of any size — memory usage is O(matching_lines), not O(total_lines).
import tempfile, os
# Setup: a simulated log file
log_content = """[INFO] 2024-01-15 10:00:01 - Server started
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[INFO] 2024-01-15 10:00:06 - Retrying connection
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[WARNING] 2024-01-15 10:00:12 - High memory usage
[INFO] 2024-01-15 10:00:15 - Connection restored
[ERROR] 2024-01-15 10:00:20 - Disk space low
""".lstrip()
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write(log_content)
tmp.close()
filepath = tmp.name
def find_lines(path, keyword):
    # TODO: Read the file line by line (memory-efficient iteration).
    # Return a list of lines that contain the keyword (case-insensitive).
    # Strip trailing newlines from each returned line.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
errors = find_lines(filepath, "ERROR")
for line in errors:
print(line)
print("---")
print(len(find_lines(filepath, "info")))
os.unlink(filepath)Expected Output
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[ERROR] 2024-01-15 10:00:20 - Disk space low
---
3
Hints
Hint 1: Use 'for line in f:' to iterate line by line — this is memory-efficient.
Hint 2: Convert both the line and keyword to lowercase for case-insensitive matching.
Hint 3: Use line.rstrip('\n') to remove the trailing newline before appending to the result list.
Write a function detect_file_type(path) that opens a file in binary mode, reads the first 8 bytes, and returns the file type based on magic bytes (file signatures):
- Starts with
b"\x89PNG"returns"PNG" - Starts with
b"%PDF"returns"PDF" - Starts with
b"\xff\xd8"returns"JPEG" - Otherwise returns
"UNKNOWN"
This tests binary mode reading and the real-world pattern of identifying file types by their header bytes.
import tempfile, os
# Setup: create fake files with known magic bytes
def write_fake_file(header_bytes):
    """Create a temporary ``.bin`` file that begins with ``header_bytes``.

    The header is followed by 100 zero bytes of padding. The file is created
    with ``delete=False``, so the caller must remove the returned path.
    """
    padding = b"\x00" * 100
    with tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False) as tmp:
        tmp.write(header_bytes + padding)
    return tmp.name
png_file = write_fake_file(b"\x89PNG\r\n\x1a\n")
pdf_file = write_fake_file(b"%PDF-1.4")
jpg_file = write_fake_file(b"\xff\xd8\xff\xe0")
unknown_file = write_fake_file(b"XXXX????")
def detect_file_type(path):
    """Identify a file by the magic bytes at its start.

    The first 8 bytes are read in binary mode and compared against known
    signatures. Returns "PNG", "PDF" or "JPEG", or "UNKNOWN" when no
    signature matches.
    """
    # Checked in this order; the first matching prefix wins.
    signatures = (
        (b"\x89PNG", "PNG"),
        (b"%PDF", "PDF"),
        (b"\xff\xd8", "JPEG"),
    )
    with open(path, "rb") as f:
        header = f.read(8)
    for magic, file_type in signatures:
        if header.startswith(magic):
            return file_type
    return "UNKNOWN"
# Test
print(detect_file_type(png_file))
print(detect_file_type(pdf_file))
print(detect_file_type(jpg_file))
print(detect_file_type(unknown_file))
for f in [png_file, pdf_file, jpg_file, unknown_file]:
os.unlink(f)Solution
def detect_file_type(path):
    """Identify a file by the magic bytes at its start.

    The first 8 bytes are read in binary mode and compared against known
    signatures. Returns "PNG", "PDF" or "JPEG", or "UNKNOWN" when no
    signature matches.
    """
    # Checked in this order; the first matching prefix wins.
    signatures = (
        (b"\x89PNG", "PNG"),
        (b"%PDF", "PDF"),
        (b"\xff\xd8", "JPEG"),
    )
    with open(path, "rb") as f:
        header = f.read(8)
    for magic, file_type in signatures:
        if header.startswith(magic):
            return file_type
    return "UNKNOWN"
Key points:
- Binary mode (`"rb"`) returns `bytes`, not `str`. No encoding/decoding is performed.
- `f.read(8)` reads up to 8 bytes (fewer if the file is shorter) — enough to identify most common file formats.
- `bytes.startswith()` works with byte literals like `b"\x89PNG"`.
- Real tools like the `file` command on Unix use the same magic-byte approach with a database of thousands of signatures.
import tempfile, os
# Setup: create fake files with known magic bytes
def write_fake_file(header_bytes):
    """Create a temporary ``.bin`` file that begins with ``header_bytes``.

    The header is followed by 100 zero bytes of padding. The file is created
    with ``delete=False``, so the caller must remove the returned path.
    """
    padding = b"\x00" * 100
    with tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False) as tmp:
        tmp.write(header_bytes + padding)
    return tmp.name
png_file = write_fake_file(b"\x89PNG\r\n\x1a\n")
pdf_file = write_fake_file(b"%PDF-1.4")
jpg_file = write_fake_file(b"\xff\xd8\xff\xe0")
unknown_file = write_fake_file(b"XXXX????")
def detect_file_type(path):
    # TODO: Open the file in binary mode.
    # Read the first 8 bytes.
    # Return the file type based on these rules:
    # - Starts with b"\x89PNG" -> "PNG"
    # - Starts with b"%PDF" -> "PDF"
    # - Starts with b"\xff\xd8" -> "JPEG"
    # - Otherwise -> "UNKNOWN"
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
print(detect_file_type(png_file))
print(detect_file_type(pdf_file))
print(detect_file_type(jpg_file))
print(detect_file_type(unknown_file))
for f in [png_file, pdf_file, jpg_file, unknown_file]:
os.unlink(f)Expected Output
PNG
PDF
JPEG
UNKNOWN
Hints
Hint 1: Open with mode 'rb' for binary reading. Do NOT pass encoding.
Hint 2: Use f.read(8) to read the first 8 bytes as a bytes object.
Hint 3: Use header.startswith(b"\x89PNG") to check the magic bytes.
Write a function read_with_error_handling(path) that:
- First attempts to read a file as UTF-8 with
errors="strict"— catch theUnicodeDecodeErrorand print"Caught error: "followed by the error message. - Then re-reads the file with
errors="replace"and returns the content (bad bytes become the replacement character).
This tests your understanding of encoding errors and the errors parameter to open().
import tempfile, os
# Setup: write a file with mixed encodings (Latin-1 byte inside UTF-8 text)
tmp = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
tmp.write(b"Hello World\n")
tmp.write(b"caf\xe9 latt\xe9\n") # Latin-1 encoded e-acute
tmp.write(b"Price: 42 \x80\n") # 0x80 is invalid UTF-8
tmp.write(b"Goodbye\n")
tmp.close()
filepath = tmp.name
def read_with_error_handling(path):
    """Read ``path`` as UTF-8, reporting and then tolerating invalid bytes.

    A first strict pass only probes for decoding problems; if it raises
    ``UnicodeDecodeError`` the error is printed with a "Caught error:"
    prefix. The file is then read again with ``errors="replace"`` and that
    content is returned, with each undecodable byte shown as U+FFFD.
    """
    try:
        # Strict probe: the decoded text is discarded on purpose.
        with open(path, "r", encoding="utf-8", errors="strict") as strict_file:
            strict_file.read()
    except UnicodeDecodeError as exc:
        print("Caught error:", exc)
    # Lenient pass whose result is actually returned.
    with open(path, "r", encoding="utf-8", errors="replace") as lenient_file:
        content = lenient_file.read()
    return content
# Test
content = read_with_error_handling(filepath)
print("---CONTENT---")
print(content)
os.unlink(filepath)Solution
def read_with_error_handling(path):
    """Read ``path`` as UTF-8, reporting and then tolerating invalid bytes.

    A first strict pass only probes for decoding problems; if it raises
    ``UnicodeDecodeError`` the error is printed with a "Caught error:"
    prefix. The file is then read again with ``errors="replace"`` and that
    content is returned, with each undecodable byte shown as U+FFFD.
    """
    try:
        # Strict probe: the decoded text is discarded on purpose.
        with open(path, "r", encoding="utf-8", errors="strict") as strict_file:
            strict_file.read()
    except UnicodeDecodeError as exc:
        print("Caught error:", exc)
    # Lenient pass whose result is actually returned.
    with open(path, "r", encoding="utf-8", errors="replace") as lenient_file:
        content = lenient_file.read()
    return content
Key points:
- `errors="strict"` (the default) raises `UnicodeDecodeError` on any byte sequence invalid in the specified encoding.
- `errors="replace"` substitutes each invalid byte with U+FFFD (the Unicode replacement character).
- The byte `0xe9` is valid Latin-1 (representing e-acute). In UTF-8 it is the lead byte of a three-byte sequence, so decoding fails with "invalid continuation byte" because the bytes that follow it here are not continuation bytes.
- In production, use `errors="replace"` for best-effort reading, or detect the encoding with `chardet.detect()` first.
import tempfile, os
# Setup: write a file with mixed encodings (Latin-1 byte inside UTF-8 text)
tmp = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
tmp.write(b"Hello World\n")
tmp.write(b"caf\xe9 latt\xe9\n") # Latin-1 encoded e-acute
tmp.write(b"Price: 42 \x80\n") # 0x80 is invalid UTF-8
tmp.write(b"Goodbye\n")
tmp.close()
filepath = tmp.name
def read_with_error_handling(path):
    # TODO: Try reading the file as UTF-8 with errors="strict".
    # Catch UnicodeDecodeError and print the error message.
    # Then read again with errors="replace" and return the content.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
content = read_with_error_handling(filepath)
print("---CONTENT---")
print(content)
os.unlink(filepath)Expected Output
Caught error: 'utf-8' codec can't decode byte 0xe9 in position 15: invalid continuation byte
---CONTENT---
Hello World
caf� latt�
Price: 42 �
Goodbye
Hints
Hint 1: First, open with encoding='utf-8' and errors='strict' inside a try block.
Hint 2: Catch UnicodeDecodeError and print the error with a 'Caught error:' prefix.
Hint 3: Then open again with errors='replace' to substitute bad bytes with the replacement character.
Write a function parse_config(path) that reads a key=value config file and returns a dictionary. Skip blank lines and comment lines (starting with #). Strip whitespace from keys and values.
This is a real-world pattern used in dotenv files, INI configs, and settings files.
import tempfile, os
# Setup: a key=value config file with comments and blank lines
config_content = """# Database settings
DB_HOST=localhost
DB_PORT=5432
DB_NAME=myapp
# App settings
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
"""
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".conf", delete=False)
tmp.write(config_content)
tmp.close()
filepath = tmp.name
def parse_config(path):
    """Parse a ``key=value`` config file at ``path`` into a dictionary.

    Blank lines and lines whose first non-space character is ``#`` are
    skipped. Each remaining line is split at its first ``=``; key and value
    are stripped of surrounding whitespace. A line without ``=`` is stored
    as a key with an empty-string value.
    """
    config = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if entry.startswith("#") or not entry:
                continue
            # partition() splits on the first "=" only, so values may contain "=".
            name, _sep, setting = entry.partition("=")
            config[name.strip()] = setting.strip()
    return config
# Test
config = parse_config(filepath)
for key in sorted(config):
print(key + "=" + config[key])
print("---")
print(len(config))
os.unlink(filepath)Solution
def parse_config(path):
    """Parse a ``key=value`` config file at ``path`` into a dictionary.

    Blank lines and lines whose first non-space character is ``#`` are
    skipped. Each remaining line is split at its first ``=``; key and value
    are stripped of surrounding whitespace. A line without ``=`` is stored
    as a key with an empty-string value.
    """
    config = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if entry.startswith("#") or not entry:
                continue
            # partition() splits on the first "=" only, so values may contain "=".
            name, _sep, setting = entry.partition("=")
            config[name.strip()] = setting.strip()
    return config
Key points:
- `str.partition("=")` splits on the first `=` only, returning `(key, "=", value)`. This handles values containing `=`.
- `str.split("=")` would break on values like `SECRET_KEY=abc=def=123`.
- Skipping blank lines and comments with `continue` keeps the logic flat and readable.
- This is the same basic idea behind tools such as `python-dotenv`, the standard-library `configparser`, and many custom config loaders.
import tempfile, os
# Setup: a key=value config file with comments and blank lines
config_content = """# Database settings
DB_HOST=localhost
DB_PORT=5432
DB_NAME=myapp
# App settings
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
"""
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".conf", delete=False)
tmp.write(config_content)
tmp.close()
filepath = tmp.name
def parse_config(path):
    # TODO: Read the file line by line.
    # Skip empty lines and lines starting with '#'.
    # Split each remaining line on '=' and build a dict.
    # Strip whitespace from both key and value.
    # Return the dict.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
config = parse_config(filepath)
for key in sorted(config):
print(key + "=" + config[key])
print("---")
print(len(config))
os.unlink(filepath)Expected Output
DB_HOST=localhost
DB_NAME=myapp
DB_PORT=5432
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
---
6
Hints
Hint 1: Use line.strip() to remove whitespace and newlines, then check 'if not line or line.startswith("#")'.
Hint 2: Use line.partition('=') to split on the first '=' only — this handles values that contain '='.
Hint 3: Strip whitespace from both the key and value with .strip().
Hard
Write a function chunked_sha256(path, chunk_size=4096) that computes the SHA-256 hash of a file using chunked binary reads. The function must never load the entire file into memory at once.
This is the standard pattern for hashing large files — the same approach used by tools like sha256sum and backup verification systems.
import tempfile, os, hashlib
# Setup: create a file with known content
content = b"The quick brown fox jumps over the lazy dog" * 1000
tmp = tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
# Compute expected hash for verification
expected = hashlib.sha256(content).hexdigest()
def chunked_sha256(path, chunk_size=4096):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in binary mode ``chunk_size`` bytes at a time and each
    chunk is fed to the hasher, so the whole file is never held in memory.

    Raises:
        ValueError: if ``chunk_size`` is not positive. ``read(0)`` returns
            ``b""`` immediately, which is indistinguishable from EOF and
            would yield the digest of empty input; a negative size makes
            ``read()`` load the entire file at once.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:  # b"" signals end-of-file
                break
            hasher.update(chunk)
    return hasher.hexdigest()
# Test
result = chunked_sha256(filepath)
print(result)
print(result == expected)
# Test with different chunk size
result2 = chunked_sha256(filepath, chunk_size=128)
print(result2 == expected)
os.unlink(filepath)Solution
def chunked_sha256(path, chunk_size=4096):
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in binary mode ``chunk_size`` bytes at a time and each
    chunk is fed to the hasher, so the whole file is never held in memory.

    Raises:
        ValueError: if ``chunk_size`` is not positive. ``read(0)`` returns
            ``b""`` immediately, which is indistinguishable from EOF and
            would yield the digest of empty input; a negative size makes
            ``read()`` load the entire file at once.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:  # b"" signals end-of-file
                break
            hasher.update(chunk)
    return hasher.hexdigest()
Key points:
- `hashlib.sha256()` creates a stateful hasher. Calling `update(chunk)` feeds bytes incrementally.
- The hash result is identical regardless of chunk size — `update()` is designed for streaming.
- Memory usage is O(chunk_size), not O(file_size). A 10 GB file uses only 4 KB of buffer.
- `f.read(chunk_size)` in binary mode returns `b""` at EOF, which is falsy.
- In production, use 64 KB or 256 KB chunks to reduce system call overhead on large files.
import tempfile, os, hashlib
# Setup: create a file with known content
content = b"The quick brown fox jumps over the lazy dog" * 1000
tmp = tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name
# Compute expected hash for verification
expected = hashlib.sha256(content).hexdigest()
def chunked_sha256(path, chunk_size=4096):
    # TODO: Compute the SHA-256 hash of a file using chunked reads.
    # Open in binary mode, read chunk_size bytes at a time,
    # feed each chunk to the hasher, and return the hex digest.
    # This approach works on files of any size without loading
    # the entire file into memory.
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
result = chunked_sha256(filepath)
print(result)
print(result == expected)
# Test with different chunk size
result2 = chunked_sha256(filepath, chunk_size=128)
print(result2 == expected)
os.unlink(filepath)Expected Output
4f57a5b51edd0e1e8871e547e18a23b41bbf5ee4c006a61f48e8baed04f67d0c
True
True
Hints
Hint 1: Create a hashlib.sha256() hasher, then read in a while loop with f.read(chunk_size).
Hint 2: Break the loop when f.read() returns an empty bytes object b''.
Hint 3: Call hasher.update(chunk) for each chunk, then return hasher.hexdigest() at the end.
Write a function aggregate_log(path) that streams a log file and produces an aggregation dictionary. Each log line follows the format [LEVEL] YYYY-MM-DD HH:MM:SS - message.
Return a dict with:
"total_lines"— count of non-empty lines"level_counts"— dict mapping each level to its count"first_timestamp"and"last_timestamp"— the time range"error_messages"— list of message strings from ERROR lines only
The function must use streaming (no read() or readlines()).
import tempfile, os
from collections import Counter
# Setup: create a log file
log_lines = [
"[INFO] 2024-01-15 10:00:01 - Server started",
"[INFO] 2024-01-15 10:00:02 - Loading config",
"[WARNING] 2024-01-15 10:00:03 - Deprecated API used",
"[ERROR] 2024-01-15 10:00:05 - Database timeout",
"[INFO] 2024-01-15 10:00:06 - Retrying connection",
"[ERROR] 2024-01-15 10:00:10 - Query failed",
"[INFO] 2024-01-15 10:00:12 - Connection restored",
"[WARNING] 2024-01-15 10:00:15 - Slow query detected",
"[INFO] 2024-01-15 10:00:20 - Request processed",
"[ERROR] 2024-01-15 10:00:25 - Out of memory",
"[INFO] 2024-01-15 10:00:30 - Garbage collection",
"[INFO] 2024-01-15 10:00:35 - Health check OK",
]
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write("\n".join(log_lines) + "\n")
tmp.close()
filepath = tmp.name
def aggregate_log(path):
    """Stream the log file at ``path`` and summarise it in a single pass.

    Each non-empty line is expected to look like
    ``[LEVEL] YYYY-MM-DD HH:MM:SS - message``.

    Returns a dict with:
        "total_lines": number of non-empty lines,
        "level_counts": dict mapping each level to its count,
        "first_timestamp" / "last_timestamp": timestamps of the first and
            last non-empty lines (``None`` if there are none),
        "error_messages": messages of the ERROR lines, in file order.

    Raises:
        ValueError: if a non-empty line contains no ``]``.
    """
    summary = {
        "total_lines": 0,
        "level_counts": {},
        "first_timestamp": None,
        "last_timestamp": None,
        "error_messages": [],
    }
    level_counts = summary["level_counts"]
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:
                continue
            summary["total_lines"] += 1

            bracket_end = line.index("]")
            level = line[1:bracket_end]
            # Skip the "]" and the space after it, then split on the first " - ".
            timestamp_part, _, message = line[bracket_end + 2:].partition(" - ")
            timestamp = timestamp_part.strip()

            level_counts[level] = level_counts.get(level, 0) + 1
            if summary["first_timestamp"] is None:
                summary["first_timestamp"] = timestamp
            summary["last_timestamp"] = timestamp
            if level == "ERROR":
                summary["error_messages"].append(message)
    return summary
# Test
result = aggregate_log(filepath)
print(result["total_lines"])
print(sorted(result["level_counts"].items()))
print(result["first_timestamp"])
print(result["last_timestamp"])
for msg in result["error_messages"]:
print("ERROR:", msg)
os.unlink(filepath)Solution
def aggregate_log(path):
    """Stream the log file at ``path`` and summarise it in a single pass.

    Each non-empty line is expected to look like
    ``[LEVEL] YYYY-MM-DD HH:MM:SS - message``.

    Returns a dict with:
        "total_lines": number of non-empty lines,
        "level_counts": dict mapping each level to its count,
        "first_timestamp" / "last_timestamp": timestamps of the first and
            last non-empty lines (``None`` if there are none),
        "error_messages": messages of the ERROR lines, in file order.

    Raises:
        ValueError: if a non-empty line contains no ``]``.
    """
    summary = {
        "total_lines": 0,
        "level_counts": {},
        "first_timestamp": None,
        "last_timestamp": None,
        "error_messages": [],
    }
    level_counts = summary["level_counts"]
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.rstrip("\n")
            if not line:
                continue
            summary["total_lines"] += 1

            bracket_end = line.index("]")
            level = line[1:bracket_end]
            # Skip the "]" and the space after it, then split on the first " - ".
            timestamp_part, _, message = line[bracket_end + 2:].partition(" - ")
            timestamp = timestamp_part.strip()

            level_counts[level] = level_counts.get(level, 0) + 1
            if summary["first_timestamp"] is None:
                summary["first_timestamp"] = timestamp
            summary["last_timestamp"] = timestamp
            if level == "ERROR":
                summary["error_messages"].append(message)
    return summary
Key points:
- Single-pass streaming: the file is read line by line with `for line in f`, using O(1) memory per line.
- `line.index("]")` finds the closing bracket. `line[1:bracket_end]` extracts the level string.
- `str.partition(" - ")` splits on the first occurrence only, safely handling messages that contain ` - `.
- `dict.get(level, 0) + 1` is a clean pattern for counting without `defaultdict` or `Counter`.
- This approach scales to multi-gigabyte log files with constant memory usage (apart from the collected error messages).
import tempfile, os
from collections import Counter
# Setup: create a log file
log_lines = [
"[INFO] 2024-01-15 10:00:01 - Server started",
"[INFO] 2024-01-15 10:00:02 - Loading config",
"[WARNING] 2024-01-15 10:00:03 - Deprecated API used",
"[ERROR] 2024-01-15 10:00:05 - Database timeout",
"[INFO] 2024-01-15 10:00:06 - Retrying connection",
"[ERROR] 2024-01-15 10:00:10 - Query failed",
"[INFO] 2024-01-15 10:00:12 - Connection restored",
"[WARNING] 2024-01-15 10:00:15 - Slow query detected",
"[INFO] 2024-01-15 10:00:20 - Request processed",
"[ERROR] 2024-01-15 10:00:25 - Out of memory",
"[INFO] 2024-01-15 10:00:30 - Garbage collection",
"[INFO] 2024-01-15 10:00:35 - Health check OK",
]
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write("\n".join(log_lines) + "\n")
tmp.close()
filepath = tmp.name
def aggregate_log(path):
    # TODO: Stream the log file line by line.
    # Each line has format: [LEVEL] YYYY-MM-DD HH:MM:SS - message
    #
    # Return a dict with:
    # "total_lines": int - total number of non-empty lines
    # "level_counts": dict - mapping level string to count
    # "first_timestamp": str - first timestamp seen (YYYY-MM-DD HH:MM:SS)
    # "last_timestamp": str - last timestamp seen
    # "error_messages": list - list of message strings from ERROR lines only
    #
    # Must work in constant memory (aside from error_messages list).
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
result = aggregate_log(filepath)
print(result["total_lines"])
print(sorted(result["level_counts"].items()))
print(result["first_timestamp"])
print(result["last_timestamp"])
for msg in result["error_messages"]:
print("ERROR:", msg)
os.unlink(filepath)Expected Output
12
[('ERROR', 3), ('INFO', 7), ('WARNING', 2)]
2024-01-15 10:00:01
2024-01-15 10:00:35
ERROR: Database timeout
ERROR: Query failed
ERROR: Out of memory
Hints
Hint 1: Parse each line: extract the level between [ and ], then split the rest to get timestamp and message.
Hint 2: Use line[1:line.index(']')] to get the level. The rest after '] ' contains timestamp and message.
Hint 3: Split the rest on ' - ' with maxsplit=1 to separate the timestamp portion from the message.
Write a function smart_read(path, encodings=None) that attempts to read a file using multiple encodings in order.
Default encodings: ["utf-8", "latin-1", "ascii"]. For each encoding, try reading with errors="strict". On success, return (content, encoding_used). If all fail, raise ValueError.
This tests understanding of encoding fallback strategies. Note the subtle behavior: Latin-1 accepts every possible byte, so it never fails — making encoding order critical.
import tempfile, os
# Setup: create files with different encodings
utf8_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
utf8_file.write("Hello caf\u00e9 \u2014 world".encode("utf-8"))
utf8_file.close()
latin1_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
latin1_file.write("Hello caf\u00e9 world".encode("latin-1"))
latin1_file.close()
ascii_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
ascii_file.write(b"Hello plain world")
ascii_file.close()
def smart_read(path, encodings=None):
    """Read the file at ``path`` with the first encoding that decodes it.

    Args:
        path: file to read.
        encodings: encoding names to try, in order, each with
            ``errors="strict"``. Defaults to ``["utf-8", "latin-1", "ascii"]``.

    Returns:
        A ``(content, encoding_used)`` tuple for the first encoding that
        succeeds.

    Raises:
        ValueError: if none of the encodings can decode the file.
    """
    if encodings is None:
        encodings = ["utf-8", "latin-1", "ascii"]
    for enc in encodings:
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                content = f.read()
        except (ValueError, LookupError):
            # UnicodeDecodeError is a ValueError subclass (decode failure);
            # LookupError is what open() raises for an unknown encoding name.
            continue
        return (content, enc)
    raise ValueError("Could not decode file with any of the provided encodings")
# Test
content1, enc1 = smart_read(utf8_file.name)
print(enc1 + ":", repr(content1))
content2, enc2 = smart_read(latin1_file.name)
print(enc2 + ":", repr(content2))
content3, enc3 = smart_read(ascii_file.name)
print(enc3 + ":", repr(content3))
# Test with custom encoding order
content4, enc4 = smart_read(latin1_file.name, encodings=["ascii", "latin-1"])
print(enc4 + ":", repr(content4))
for f in [utf8_file.name, latin1_file.name, ascii_file.name]:
os.unlink(f)Solution
def smart_read(path, encodings=None):
    """Read the file at ``path`` with the first encoding that decodes it.

    Args:
        path: file to read.
        encodings: encoding names to try, in order, each with
            ``errors="strict"``. Defaults to ``["utf-8", "latin-1", "ascii"]``.

    Returns:
        A ``(content, encoding_used)`` tuple for the first encoding that
        succeeds.

    Raises:
        ValueError: if none of the encodings can decode the file.
    """
    if encodings is None:
        encodings = ["utf-8", "latin-1", "ascii"]
    for enc in encodings:
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                content = f.read()
        except (ValueError, LookupError):
            # UnicodeDecodeError is a ValueError subclass (decode failure);
            # LookupError is what open() raises for an unknown encoding name.
            continue
        return (content, enc)
    raise ValueError("Could not decode file with any of the provided encodings")
Key points:
- Encoding order matters critically. Latin-1 (ISO-8859-1) maps every byte 0x00-0xFF to a character, so it never raises `UnicodeDecodeError`. If listed first, it would always "succeed" — even on UTF-8 files, producing garbled output.
- UTF-8 should be tried first because it is the most common modern encoding and has strict validation (invalid byte sequences are rejected).
- The Latin-1 file is rejected by UTF-8 (its lone byte 0xE9 is not followed by continuation bytes), so `smart_read` falls through to Latin-1. Because Latin-1 accepts any byte, later entries in the list are never reached, and a file in some other single-byte encoding would be silently mis-decoded as Latin-1. This demonstrates why encoding detection is inherently imperfect.
- Production tools like `chardet` use statistical analysis of byte patterns to guess encodings with confidence scores.
- `UnicodeDecodeError` is a subclass of `ValueError`, so catching `ValueError` already covers decode failures. An unknown encoding name raises `LookupError`, not `ValueError`, so it must be caught separately if invalid names should be skipped.
import tempfile, os
# Setup: create files with different encodings
utf8_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
utf8_file.write("Hello café — world".encode("utf-8"))
utf8_file.close()
latin1_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
latin1_file.write("Hello café world".encode("latin-1"))
latin1_file.close()
ascii_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
ascii_file.write(b"Hello plain world")
ascii_file.close()
def smart_read(path, encodings=None):
    # TODO: Try to read the file using each encoding in the list, in order.
    # If encodings is None, default to ["utf-8", "latin-1", "ascii"].
    # For each encoding, attempt to read with errors="strict".
    # Return a tuple of (content_string, encoding_used).
    # If the encoding works, return immediately.
    # If all encodings fail, raise a ValueError with the message
    # "Could not decode file with any of the provided encodings".
    pass  # Placeholder: the stub returns None until the exercise is implemented.
# Test
content1, enc1 = smart_read(utf8_file.name)
print(enc1 + ":", repr(content1))
content2, enc2 = smart_read(latin1_file.name)
print(enc2 + ":", repr(content2))
content3, enc3 = smart_read(ascii_file.name)
print(enc3 + ":", repr(content3))
# Test with custom encoding order
content4, enc4 = smart_read(latin1_file.name, encodings=["ascii", "latin-1"])
print(enc4 + ":", repr(content4))
for f in [utf8_file.name, latin1_file.name, ascii_file.name]:
os.unlink(f)Expected Output
utf-8: 'Hello café — world'
latin-1: 'Hello café world'
utf-8: 'Hello plain world'
latin-1: 'Hello café world'
Hints
Hint 1: Use a for loop over the encodings list. Inside, try opening with errors='strict'.
Hint 2: Catch UnicodeDecodeError and continue to the next encoding.
Hint 3: If the read succeeds, return (content, encoding) immediately.
Hint 4: Note: Latin-1 never raises UnicodeDecodeError because every byte 0x00-0xFF is valid in Latin-1.
