Skip to main content

Python Reading Files Practice Problems & Exercises

Practice: Reading Files

11 problems · 4 Easy · 4 Medium · 3 Hard · 40-55 min
← Back to lesson

Easy

#1 Read Entire File — Easy
Topics: open(), read(), with statement

Write a function read_entire_file(path) that opens a text file using a with statement and UTF-8 encoding, reads its entire content, and returns it as a single string.

This tests the most fundamental file reading pattern: with open() plus f.read().

Python
import tempfile, os

# Setup: create a temporary file
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write("Hello, Python!\nWelcome to file reading.")
tmp.close()
filepath = tmp.name


def read_entire_file(path):
    """Return the complete text of the file at ``path``, decoded as UTF-8."""
    # The with block closes the handle even if read() raises.
    with open(path, mode="r", encoding="utf-8") as handle:
        text = handle.read()
    return text


# Test
content = read_entire_file(filepath)
print(repr(content))
print(type(content).__name__)

os.unlink(filepath)
Solution
def read_entire_file(path):
    """Read the UTF-8 text file at ``path`` and return its content as one string."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

Key points:

  • with open() guarantees the file is closed even if an exception occurs.
  • Always specify encoding="utf-8" explicitly — the default varies by platform.
  • f.read() with no arguments reads the entire file into a single str object.
  • This pattern is appropriate for small files. For large files, use iteration instead.
import tempfile, os

# Setup: create a temporary file
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write("Hello, Python!\nWelcome to file reading.")
tmp.close()
filepath = tmp.name


def read_entire_file(path):
  """Exercise stub: should return the full content of the file at ``path`` as a str."""
  # TODO: Open the file with UTF-8 encoding using a with statement.
  # Read the entire content and return it as a string.
  pass


# Test
content = read_entire_file(filepath)
print(repr(content))
print(type(content).__name__)

os.unlink(filepath)
Expected Output
'Hello, Python!\nWelcome to file reading.'
str
Hints

Hint 1: Use 'with open(path, "r", encoding="utf-8") as f:' to open the file safely.

Hint 2: Call f.read() with no arguments to read the entire file into one string.

#2 Read Lines Into a List — Easy
Topics: readlines(), newline stripping, list processing

Write a function read_lines_clean(path) that reads all lines from a file and returns them as a list with trailing newlines removed.

This practices readlines() and the common pattern of stripping newline characters from each line.

Python
import tempfile, os

# Setup
content = "apple\nbanana\ncherry\ndate\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def read_lines_clean(path):
    """Return every line of the UTF-8 file at ``path`` without its trailing newline."""
    with open(path, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()
    # Only "\n" is removed, so spaces or tabs inside a line are kept.
    return [raw.rstrip("\n") for raw in raw_lines]


# Test
lines = read_lines_clean(filepath)
print(lines)
print(len(lines))

os.unlink(filepath)
Solution
def read_lines_clean(path):
    """Return the lines of the UTF-8 file at ``path`` with trailing newlines stripped."""
    with open(path, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f.readlines()]

Key points:

  • f.readlines() returns a list where each element is a line including its trailing \n.
  • Use rstrip("\n") instead of strip() to only remove the trailing newline, preserving any leading whitespace.
  • list(f) is equivalent to f.readlines() — both load all lines into memory.
  • For large files, prefer line-by-line iteration instead of readlines().
import tempfile, os

# Setup
content = "apple\nbanana\ncherry\ndate\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def read_lines_clean(path):
  """Exercise stub: should return the file's lines as a list, trailing newlines removed."""
  # TODO: Open the file, read all lines, and return a list of lines
  # with trailing newlines stripped.
  # Example: ["apple", "banana", "cherry", "date"]
  pass


# Test
lines = read_lines_clean(filepath)
print(lines)
print(len(lines))

os.unlink(filepath)
Expected Output
['apple', 'banana', 'cherry', 'date']
4
Hints

Hint 1: Use f.readlines() to get a list of lines, each ending with '\n'.

Hint 2: Use a list comprehension with line.rstrip('\n') to strip the trailing newlines.

#3 Count Lines with readline() — Easy
Topics: readline(), EOF detection, while loop

Write a function count_lines_readline(path) that uses readline() in a while loop to count the total number of lines in a file. Return the count.

This tests your understanding of the critical difference between an empty line ("\n") and end-of-file ("").

Python
import tempfile, os

# Setup
content = "first\nsecond\nthird\nfourth\nfifth\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def count_lines_readline(path):
    """Count the lines of the UTF-8 file at ``path`` using readline().

    readline() returns "" only at end-of-file; a blank line comes back as
    "\\n", which is truthy and therefore still counted.
    """
    line_total = 0
    with open(path, "r", encoding="utf-8") as handle:
        current = handle.readline()
        while current:
            line_total += 1
            current = handle.readline()
    return line_total


# Test
count = count_lines_readline(filepath)
print(count)

os.unlink(filepath)
Solution
def count_lines_readline(path):
    """Count the lines of the UTF-8 file at ``path`` by calling readline() until EOF."""
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        while True:
            line = f.readline()
            if not line:
                # "" means end-of-file; a blank line would be "\n".
                break
            count += 1
    return count

Key points:

  • readline() returns "" (empty string) at EOF — this is falsy, so if not line catches it.
  • An empty line in the file returns "\n" — this is truthy, so it is correctly counted.
  • This pattern is useful when you need to mix reading with other file operations (like tell()).
  • For simple line counting, for line in f with a counter or sum(1 for _ in f) is more idiomatic.
import tempfile, os

# Setup
content = "first\nsecond\nthird\nfourth\nfifth\n"
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def count_lines_readline(path):
  """Exercise stub: should return the number of lines in the file as an int."""
  # TODO: Use readline() in a while loop to count
  # the number of lines in the file.
  # Return the count as an integer.
  # Remember: readline() returns "" (empty string) at EOF,
  # but "\n" for an empty line.
  pass


# Test
count = count_lines_readline(filepath)
print(count)

os.unlink(filepath)
Expected Output
5
Hints

Hint 1: Call f.readline() in a while loop. An empty string '' signals EOF.

Hint 2: A blank line in the file returns '\n' (not ''), so it should still be counted.

#4 Read First N Characters — Easy
Topics: read(n), partial read, seek()

Write a function read_first_n(path, n) that opens a file and reads only the first n characters, returning them as a string.

This tests the read(size) parameter and understanding that in text mode, the argument is a character count, not a byte count.

Python
import tempfile, os

# Setup
content = "Python is a powerful programming language."
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def read_first_n(path, n):
    """Return at most the first ``n`` characters of the UTF-8 file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        # In text mode the size argument counts characters, not bytes.
        prefix = handle.read(n)
    return prefix


# Test
print(read_first_n(filepath, 6))
print(read_first_n(filepath, 15))
print(len(read_first_n(filepath, 10)))

os.unlink(filepath)
Solution
def read_first_n(path, n):
    """Return up to the first ``n`` characters of the UTF-8 file at ``path``."""
    with open(path, "r", encoding="utf-8") as f:
        return f.read(n)

Key points:

  • f.read(n) in text mode reads up to n characters (Unicode code points), not bytes.
  • In binary mode ("rb"), f.read(n) reads up to n bytes instead.
  • If the file has fewer than n characters, read(n) returns whatever is available without error.
  • After read(n), the file position advances by n characters — subsequent reads continue from there.
import tempfile, os

# Setup
content = "Python is a powerful programming language."
tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".txt", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name


def read_first_n(path, n):
  """Exercise stub: should return the first ``n`` characters of the file as a str."""
  # TODO: Open the file and read only the first n characters.
  # Return the result as a string.
  pass


# Test
print(read_first_n(filepath, 6))
print(read_first_n(filepath, 15))
print(len(read_first_n(filepath, 10)))

os.unlink(filepath)
Expected Output
Python
Python is a pow
10
Hints

Hint 1: Pass the number n to f.read(n) to read at most n characters (fewer if the file is shorter).

Hint 2: In text mode, read(n) reads n characters (not bytes).


Medium

#5 Filter Lines by Keyword — Medium
Topics: iteration, line filtering, memory-efficient

Write a function find_lines(path, keyword) that reads a file line by line using iteration and returns a list of all lines containing the keyword (case-insensitive). Strip trailing newlines from the returned lines.

This tests the idiomatic for line in f pattern — the most memory-efficient way to process lines.

Python
import tempfile, os

# Setup: a simulated log file
log_content = """[INFO] 2024-01-15 10:00:01 - Server started
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[INFO] 2024-01-15 10:00:06 - Retrying connection
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[WARNING] 2024-01-15 10:00:12 - High memory usage
[INFO] 2024-01-15 10:00:15 - Connection restored
[ERROR] 2024-01-15 10:00:20 - Disk space low
""".lstrip()

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write(log_content)
tmp.close()
filepath = tmp.name


def find_lines(path, keyword):
    """Return the lines of the file at ``path`` that contain ``keyword``.

    Matching is case-insensitive, and the returned lines have their trailing
    newline removed.
    """
    needle = keyword.lower()  # lowered once, outside the loop
    with open(path, "r", encoding="utf-8") as handle:
        # Iterating the handle streams the file line by line.
        return [
            line.rstrip("\n")
            for line in handle
            if needle in line.lower()
        ]


# Test
errors = find_lines(filepath, "ERROR")
for line in errors:
    print(line)
print("---")
print(len(find_lines(filepath, "info")))

os.unlink(filepath)
Solution
def find_lines(path, keyword):
    """Return lines containing ``keyword`` (case-insensitive), newlines stripped."""
    keyword_lower = keyword.lower()
    results = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if keyword_lower in line.lower():
                results.append(line.rstrip("\n"))
    return results

Key points:

  • for line in f iterates using the file's internal buffer — only one line is in memory at a time.
  • Lowercase both sides once (keyword_lower outside the loop) to avoid redundant .lower() calls on the keyword.
  • rstrip("\n") removes only the trailing newline, preserving any meaningful whitespace.
  • This pattern works on files of any size — memory usage is O(matching_lines), not O(total_lines).
import tempfile, os

# Setup: a simulated log file
log_content = """[INFO] 2024-01-15 10:00:01 - Server started
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[INFO] 2024-01-15 10:00:06 - Retrying connection
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[WARNING] 2024-01-15 10:00:12 - High memory usage
[INFO] 2024-01-15 10:00:15 - Connection restored
[ERROR] 2024-01-15 10:00:20 - Disk space low
""".lstrip()

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write(log_content)
tmp.close()
filepath = tmp.name


def find_lines(path, keyword):
  """Exercise stub: should return the lines that contain ``keyword``, ignoring case."""
  # TODO: Read the file line by line (memory-efficient iteration).
  # Return a list of lines that contain the keyword (case-insensitive).
  # Strip trailing newlines from each returned line.
  pass


# Test
errors = find_lines(filepath, "ERROR")
for line in errors:
  print(line)
print("---")
print(len(find_lines(filepath, "info")))

os.unlink(filepath)
Expected Output
[ERROR] 2024-01-15 10:00:05 - Database connection failed
[ERROR] 2024-01-15 10:00:10 - Timeout on query
[ERROR] 2024-01-15 10:00:20 - Disk space low
---
3
Hints

Hint 1: Use 'for line in f:' to iterate line by line — this is memory-efficient.

Hint 2: Convert both the line and keyword to lowercase for case-insensitive matching.

Hint 3: Use line.rstrip('\n') to remove the trailing newline before appending to the result list.

#6 Binary Header Inspector — Medium
Topics: binary mode, rb, bytes, file signatures

Write a function detect_file_type(path) that opens a file in binary mode, reads the first 8 bytes, and returns the file type based on magic bytes (file signatures):

  • Starts with b"\x89PNG" returns "PNG"
  • Starts with b"%PDF" returns "PDF"
  • Starts with b"\xff\xd8" returns "JPEG"
  • Otherwise returns "UNKNOWN"

This tests binary mode reading and the real-world pattern of identifying file types by their header bytes.

Python
import tempfile, os

# Setup: create fake files with known magic bytes
def write_fake_file(header_bytes):
    """Create a temporary .bin file that starts with ``header_bytes``; return its path."""
    # The header is followed by 100 zero bytes of padding.
    payload = header_bytes + b"\x00" * 100
    # delete=False keeps the file on disk after it is closed.
    with tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False) as handle:
        handle.write(payload)
    return handle.name

png_file = write_fake_file(b"\x89PNG\r\n\x1a\n")
pdf_file = write_fake_file(b"%PDF-1.4")
jpg_file = write_fake_file(b"\xff\xd8\xff\xe0")
unknown_file = write_fake_file(b"XXXX????")


def detect_file_type(path):
    """Identify a file as "PNG", "PDF", "JPEG" or "UNKNOWN" from its leading bytes."""
    # (magic prefix, label) pairs, checked in this order.
    signatures = (
        (b"\x89PNG", "PNG"),
        (b"%PDF", "PDF"),
        (b"\xff\xd8", "JPEG"),
    )
    # Binary mode yields bytes, so no decoding is performed.
    with open(path, "rb") as stream:
        header = stream.read(8)
    for magic, label in signatures:
        if header.startswith(magic):
            return label
    return "UNKNOWN"


# Test
print(detect_file_type(png_file))
print(detect_file_type(pdf_file))
print(detect_file_type(jpg_file))
print(detect_file_type(unknown_file))

for f in [png_file, pdf_file, jpg_file, unknown_file]:
    os.unlink(f)
Solution
def detect_file_type(path):
    """Return "PNG", "PDF", "JPEG" or "UNKNOWN" based on the file's first bytes."""
    with open(path, "rb") as f:
        header = f.read(8)
    if header.startswith(b"\x89PNG"):
        return "PNG"
    if header.startswith(b"%PDF"):
        return "PDF"
    if header.startswith(b"\xff\xd8"):
        return "JPEG"
    return "UNKNOWN"

Key points:

  • Binary mode ("rb") returns bytes, not str. No encoding/decoding is performed.
  • f.read(8) reads up to 8 bytes (fewer if the file is shorter) — enough to identify most common file formats.
  • bytes.startswith() works with byte literals like b"\x89PNG".
  • Real tools like the file command on Unix use the same magic-byte approach with a database of thousands of signatures.
import tempfile, os

# Setup: create fake files with known magic bytes
def write_fake_file(header_bytes):
  """Create a temporary .bin file that starts with ``header_bytes``; return its path."""
  # The header is followed by 100 zero bytes of padding.
  payload = header_bytes + b"\x00" * 100
  # delete=False keeps the file on disk after it is closed.
  with tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False) as handle:
    handle.write(payload)
  return handle.name

png_file = write_fake_file(b"\x89PNG\r\n\x1a\n")
pdf_file = write_fake_file(b"%PDF-1.4")
jpg_file = write_fake_file(b"\xff\xd8\xff\xe0")
unknown_file = write_fake_file(b"XXXX????")


def detect_file_type(path):
  """Exercise stub: should return "PNG", "PDF", "JPEG" or "UNKNOWN" for the file."""
  # TODO: Open the file in binary mode.
  # Read the first 8 bytes.
  # Return the file type based on these rules:
  #   - Starts with b"\x89PNG" -> "PNG"
  #   - Starts with b"%PDF"   -> "PDF"
  #   - Starts with b"\xff\xd8" -> "JPEG"
  #   - Otherwise             -> "UNKNOWN"
  pass


# Test
print(detect_file_type(png_file))
print(detect_file_type(pdf_file))
print(detect_file_type(jpg_file))
print(detect_file_type(unknown_file))

for f in [png_file, pdf_file, jpg_file, unknown_file]:
  os.unlink(f)
Expected Output
PNG
PDF
JPEG
UNKNOWN
Hints

Hint 1: Open with mode 'rb' for binary reading. Do NOT pass encoding.

Hint 2: Use f.read(8) to read the first 8 bytes as a bytes object.

Hint 3: Use header.startswith(b"\x89PNG") to check the magic bytes.

#7 Encoding Error Handler — Medium
Topics: encoding, errors parameter, UnicodeDecodeError

Write a function read_with_error_handling(path) that:

  1. First attempts to read a file as UTF-8 with errors="strict" — catch the UnicodeDecodeError and print "Caught error: " followed by the error message.
  2. Then re-reads the file with errors="replace" and returns the content (bad bytes become the replacement character).

This tests your understanding of encoding errors and the errors parameter to open().

Python
import tempfile, os

# Setup: write a file with mixed encodings (Latin-1 byte inside UTF-8 text)
tmp = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
tmp.write(b"Hello World\n")
tmp.write(b"caf\xe9 latt\xe9\n")       # Latin-1 encoded e-acute
tmp.write(b"Price: 42 \x80\n")          # 0x80 is invalid UTF-8
tmp.write(b"Goodbye\n")
tmp.close()
filepath = tmp.name


def read_with_error_handling(path):
    """Report a strict UTF-8 decode failure, then return a best-effort decode.

    The file is read twice: once with errors="strict", where a failure is
    printed with a "Caught error:" prefix, and once with errors="replace",
    whose result is returned.
    """
    # Pass 1: strict decoding, done only to surface and print the error.
    try:
        with open(path, "r", encoding="utf-8", errors="strict") as strict_handle:
            strict_handle.read()
    except UnicodeDecodeError as exc:
        print("Caught error:", exc)

    # Pass 2: undecodable bytes become the U+FFFD replacement character.
    with open(path, "r", encoding="utf-8", errors="replace") as lenient_handle:
        text = lenient_handle.read()
    return text


# Test
content = read_with_error_handling(filepath)
print("---CONTENT---")
print(content)

os.unlink(filepath)
Solution
def read_with_error_handling(path):
    """Print any strict UTF-8 decode error, then return the errors="replace" decode."""
    try:
        with open(path, "r", encoding="utf-8", errors="strict") as f:
            f.read()
    except UnicodeDecodeError as e:
        print("Caught error:", e)

    # Second pass: invalid bytes are replaced with U+FFFD instead of raising.
    with open(path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()

Key points:

  • errors="strict" (the default) raises UnicodeDecodeError on any byte sequence invalid in the specified encoding.
  • errors="replace" substitutes each invalid byte with U+FFFD (the Unicode replacement character).
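  • The byte 0xe9 is valid Latin-1 (representing e-acute), but in UTF-8 it is the lead byte of a three-byte sequence. The byte that follows it here is not a continuation byte, so decoding fails with "invalid continuation byte".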
  • In production, use errors="replace" for best-effort reading, or detect the encoding with chardet.detect() first.
import tempfile, os

# Setup: write a file with mixed encodings (Latin-1 byte inside UTF-8 text)
tmp = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
tmp.write(b"Hello World\n")
tmp.write(b"caf\xe9 latt\xe9\n")       # Latin-1 encoded e-acute
tmp.write(b"Price: 42 \x80\n")          # 0x80 is invalid UTF-8
tmp.write(b"Goodbye\n")
tmp.close()
filepath = tmp.name


def read_with_error_handling(path):
  """Exercise stub: should print any strict decode error and return the errors="replace" content."""
  # TODO: Try reading the file as UTF-8 with errors="strict".
  # Catch UnicodeDecodeError and print the error message.
  # Then read again with errors="replace" and return the content.
  pass


# Test
content = read_with_error_handling(filepath)
print("---CONTENT---")
print(content)

os.unlink(filepath)
Expected Output
Caught error: 'utf-8' codec can't decode byte 0xe9 in position 15: invalid continuation byte
---CONTENT---
Hello World
caf� latt�
Price: 42 �
Goodbye
Hints

Hint 1: First, open with encoding='utf-8' and errors='strict' inside a try block.

Hint 2: Catch UnicodeDecodeError and print the error with a 'Caught error:' prefix.

Hint 3: Then open again with errors='replace' to substitute bad bytes with the replacement character.

#8 Config File Parser — Medium
Topics: iteration, string parsing, real-world pattern

Write a function parse_config(path) that reads a key=value config file and returns a dictionary. Skip blank lines and comment lines (starting with #). Strip whitespace from keys and values.

This is a real-world pattern used in dotenv files, INI configs, and settings files.

Python
import tempfile, os

# Setup: a key=value config file with comments and blank lines
config_content = """# Database settings
DB_HOST=localhost
DB_PORT=5432
DB_NAME=myapp

# App settings
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
"""

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".conf", delete=False)
tmp.write(config_content)
tmp.close()
filepath = tmp.name


def parse_config(path):
    """Parse a ``key=value`` file at ``path`` into a dict of stripped strings.

    Blank lines and lines whose first non-space character is ``#`` are
    skipped. Each remaining line is split on its first ``=``.
    """
    settings = {}
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            is_blank = not entry
            if is_blank or entry.startswith("#"):
                continue
            # partition() splits on the first "=" only, so values may contain "=".
            name, _sep, raw_value = entry.partition("=")
            settings[name.strip()] = raw_value.strip()
    return settings


# Test
config = parse_config(filepath)
for key in sorted(config):
    print(key + "=" + config[key])
print("---")
print(len(config))

os.unlink(filepath)
Solution
def parse_config(path):
    """Parse a ``key=value`` file into a dict, skipping blank and ``#`` comment lines."""
    config = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "=" only so values may themselves contain "=".
            key, _, value = line.partition("=")
            config[key.strip()] = value.strip()
    return config

Key points:

  • str.partition("=") splits on the first = only, returning (key, "=", value). This handles values containing =.
  • str.split("=") would break on values like SECRET_KEY=abc=def=123.
  • Skipping blank lines and comments with continue keeps the logic flat and readable.
  • This pattern is the foundation of Python's dotenv, configparser, and many custom config loaders.
import tempfile, os

# Setup: a key=value config file with comments and blank lines
config_content = """# Database settings
DB_HOST=localhost
DB_PORT=5432
DB_NAME=myapp

# App settings
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
"""

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".conf", delete=False)
tmp.write(config_content)
tmp.close()
filepath = tmp.name


def parse_config(path):
  """Exercise stub: should return a dict built from the file's key=value lines."""
  # TODO: Read the file line by line.
  # Skip empty lines and lines starting with '#'.
  # Split each remaining line on the first '=' and build a dict.
  # Strip whitespace from both key and value.
  # Return the dict.
  pass


# Test
config = parse_config(filepath)
for key in sorted(config):
  print(key + "=" + config[key])
print("---")
print(len(config))

os.unlink(filepath)
Expected Output
DB_HOST=localhost
DB_NAME=myapp
DB_PORT=5432
DEBUG=true
LOG_LEVEL=info
MAX_RETRIES=3
---
6
Hints

Hint 1: Use line.strip() to remove whitespace and newlines, then check 'if not line or line.startswith("#")'.

Hint 2: Use line.partition('=') to split on the first '=' only — this handles values that contain '='.

Hint 3: Strip whitespace from both the key and value with .strip().


Hard

#9 Chunked Binary Hasher — Hard
Topics: binary mode, chunked reading, large files, hashlib

Write a function chunked_sha256(path, chunk_size=4096) that computes the SHA-256 hash of a file using chunked binary reads. The function must never load the entire file into memory at once.

This is the standard pattern for hashing large files — the same approach used by tools like sha256sum and backup verification systems.

Python
import tempfile, os, hashlib

# Setup: create a file with known content
content = b"The quick brown fox jumps over the lazy dog" * 1000
tmp = tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name

# Compute expected hash for verification
expected = hashlib.sha256(content).hexdigest()


def chunked_sha256(path, chunk_size=4096):
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode ``chunk_size`` bytes at a time, and each
    chunk is fed to the hasher as it is read.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        block = stream.read(chunk_size)
        # read() returns b"" at end-of-file, which ends the loop.
        while block:
            digest.update(block)
            block = stream.read(chunk_size)
    return digest.hexdigest()


# Test
result = chunked_sha256(filepath)
print(result)
print(result == expected)

# Test with different chunk size
result2 = chunked_sha256(filepath, chunk_size=128)
print(result2 == expected)

os.unlink(filepath)
Solution
def chunked_sha256(path, chunk_size=4096):
    """Return the SHA-256 hex digest of a file, reading ``chunk_size`` bytes at a time."""
    hasher = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                # b"" signals end-of-file in binary mode.
                break
            hasher.update(chunk)
    return hasher.hexdigest()

Key points:

  • hashlib.sha256() creates a stateful hasher. Calling update(chunk) feeds bytes incrementally.
  • The hash result is identical regardless of chunk size — update() is designed for streaming.
  • Memory usage is O(chunk_size), not O(file_size). A 10 GB file uses only 4 KB of buffer.
  • f.read(chunk_size) in binary mode returns b"" at EOF, which is falsy.
  • In production, use 64 KB or 256 KB chunks to reduce system call overhead on large files.
import tempfile, os, hashlib

# Setup: create a file with known content
content = b"The quick brown fox jumps over the lazy dog" * 1000
tmp = tempfile.NamedTemporaryFile("wb", suffix=".bin", delete=False)
tmp.write(content)
tmp.close()
filepath = tmp.name

# Compute expected hash for verification
expected = hashlib.sha256(content).hexdigest()


def chunked_sha256(path, chunk_size=4096):
  """Exercise stub: should return the file's SHA-256 hex digest, computed chunk by chunk."""
  # TODO: Compute the SHA-256 hash of a file using chunked reads.
  # Open in binary mode, read chunk_size bytes at a time,
  # feed each chunk to the hasher, and return the hex digest.
  # This approach works on files of any size without loading
  # the entire file into memory.
  pass


# Test
result = chunked_sha256(filepath)
print(result)
print(result == expected)

# Test with different chunk size
result2 = chunked_sha256(filepath, chunk_size=128)
print(result2 == expected)

os.unlink(filepath)
Expected Output
4f57a5b51edd0e1e8871e547e18a23b41bbf5ee4c006a61f48e8baed04f67d0c
True
True
Hints

Hint 1: Create a hashlib.sha256() hasher, then read in a while loop with f.read(chunk_size).

Hint 2: Break the loop when f.read() returns an empty bytes object b''.

Hint 3: Call hasher.update(chunk) for each chunk, then return hasher.hexdigest() at the end.

#10 Streaming Log Aggregator — Hard
Topics: generators, streaming, large files, constant memory

Write a function aggregate_log(path) that streams a log file and produces an aggregation dictionary. Each log line follows the format [LEVEL] YYYY-MM-DD HH:MM:SS - message.

Return a dict with:

  • "total_lines" — count of non-empty lines
  • "level_counts" — dict mapping each level to its count
  • "first_timestamp" and "last_timestamp" — the time range
  • "error_messages" — list of message strings from ERROR lines only

The function must use streaming (no read() or readlines()).

Python
import tempfile, os
from collections import Counter

# Setup: create a log file
log_lines = [
    "[INFO] 2024-01-15 10:00:01 - Server started",
    "[INFO] 2024-01-15 10:00:02 - Loading config",
    "[WARNING] 2024-01-15 10:00:03 - Deprecated API used",
    "[ERROR] 2024-01-15 10:00:05 - Database timeout",
    "[INFO] 2024-01-15 10:00:06 - Retrying connection",
    "[ERROR] 2024-01-15 10:00:10 - Query failed",
    "[INFO] 2024-01-15 10:00:12 - Connection restored",
    "[WARNING] 2024-01-15 10:00:15 - Slow query detected",
    "[INFO] 2024-01-15 10:00:20 - Request processed",
    "[ERROR] 2024-01-15 10:00:25 - Out of memory",
    "[INFO] 2024-01-15 10:00:30 - Garbage collection",
    "[INFO] 2024-01-15 10:00:35 - Health check OK",
]

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write("\n".join(log_lines) + "\n")
tmp.close()
filepath = tmp.name


def aggregate_log(path):
    """Summarise a log file in a single streaming pass.

    Each non-empty line is expected to look like
    ``[LEVEL] YYYY-MM-DD HH:MM:SS - message``.

    Returns:
        A dict with the keys "total_lines", "level_counts",
        "first_timestamp", "last_timestamp" and "error_messages".

    Raises:
        ValueError: if a non-empty line contains no "]".
    """
    line_total = 0
    per_level = {}
    earliest = None
    latest = None
    errors = []

    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.rstrip("\n")
            if not entry:
                continue  # blank lines are not counted
            line_total += 1

            # The level is the text between "[" and the first "]".
            close_at = entry.index("]")
            level = entry[1:close_at]

            # Skip "] " and split once on " - "; later " - " stay in the message.
            stamp_text, _sep, message = entry[close_at + 2:].partition(" - ")
            stamp = stamp_text.strip()

            per_level[level] = per_level.get(level, 0) + 1

            if earliest is None:
                earliest = stamp
            latest = stamp

            if level == "ERROR":
                errors.append(message)

    return {
        "total_lines": line_total,
        "level_counts": per_level,
        "first_timestamp": earliest,
        "last_timestamp": latest,
        "error_messages": errors,
    }


# Test
result = aggregate_log(filepath)
print(result["total_lines"])
print(sorted(result["level_counts"].items()))
print(result["first_timestamp"])
print(result["last_timestamp"])
for msg in result["error_messages"]:
    print("ERROR:", msg)

os.unlink(filepath)
Solution
def aggregate_log(path):
    """Stream a ``[LEVEL] timestamp - message`` log file and return summary statistics.

    Returns a dict with "total_lines", "level_counts", "first_timestamp",
    "last_timestamp" and "error_messages".
    """
    total = 0
    level_counts = {}
    first_ts = None
    last_ts = None
    error_messages = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            total += 1

            # Level sits between "[" and "]"; the rest starts after "] ".
            bracket_end = line.index("]")
            level = line[1:bracket_end]
            rest = line[bracket_end + 2:]

            timestamp_part, _, message = rest.partition(" - ")
            timestamp = timestamp_part.strip()

            level_counts[level] = level_counts.get(level, 0) + 1

            if first_ts is None:
                first_ts = timestamp
            last_ts = timestamp

            if level == "ERROR":
                error_messages.append(message)

    return {
        "total_lines": total,
        "level_counts": level_counts,
        "first_timestamp": first_ts,
        "last_timestamp": last_ts,
        "error_messages": error_messages,
    }

Key points:

  • Single-pass streaming: the file is read line by line with for line in f, using O(1) memory per line.
  • line.index("]") finds the closing bracket. line[1:bracket_end] extracts the level string.
  • str.partition(" - ") splits on the first occurrence only, safely handling messages that contain -.
  • dict.get(level, 0) + 1 is a clean pattern for counting without defaultdict or Counter.
  • This approach scales to multi-gigabyte log files with constant memory usage.
import tempfile, os
from collections import Counter

# Setup: create a log file
log_lines = [
  "[INFO] 2024-01-15 10:00:01 - Server started",
  "[INFO] 2024-01-15 10:00:02 - Loading config",
  "[WARNING] 2024-01-15 10:00:03 - Deprecated API used",
  "[ERROR] 2024-01-15 10:00:05 - Database timeout",
  "[INFO] 2024-01-15 10:00:06 - Retrying connection",
  "[ERROR] 2024-01-15 10:00:10 - Query failed",
  "[INFO] 2024-01-15 10:00:12 - Connection restored",
  "[WARNING] 2024-01-15 10:00:15 - Slow query detected",
  "[INFO] 2024-01-15 10:00:20 - Request processed",
  "[ERROR] 2024-01-15 10:00:25 - Out of memory",
  "[INFO] 2024-01-15 10:00:30 - Garbage collection",
  "[INFO] 2024-01-15 10:00:35 - Health check OK",
]

tmp = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".log", delete=False)
tmp.write("\n".join(log_lines) + "\n")
tmp.close()
filepath = tmp.name


def aggregate_log(path):
  """Exercise stub: should return the summary dict described in the TODO below."""
  # TODO: Stream the log file line by line.
  # Each line has format: [LEVEL] YYYY-MM-DD HH:MM:SS - message
  #
  # Return a dict with:
  #   "total_lines": int - total number of non-empty lines
  #   "level_counts": dict - mapping level string to count
  #   "first_timestamp": str - first timestamp seen (YYYY-MM-DD HH:MM:SS)
  #   "last_timestamp": str - last timestamp seen
  #   "error_messages": list - list of message strings from ERROR lines only
  #
  # Must work in constant memory (aside from error_messages list).
  pass


# Test
result = aggregate_log(filepath)
print(result["total_lines"])
print(sorted(result["level_counts"].items()))
print(result["first_timestamp"])
print(result["last_timestamp"])
for msg in result["error_messages"]:
  print("ERROR:", msg)

os.unlink(filepath)
Expected Output
12
[('ERROR', 3), ('INFO', 7), ('WARNING', 2)]
2024-01-15 10:00:01
2024-01-15 10:00:35
ERROR: Database timeout
ERROR: Query failed
ERROR: Out of memory
Hints

Hint 1: Parse each line: extract the level between [ and ], then split the rest to get timestamp and message.

Hint 2: Use line[1:line.index(']')] to get the level. The rest after '] ' contains timestamp and message.

Hint 3: Split the rest on ' - ' with maxsplit=1 to separate the timestamp portion from the message.

#11 Multi-Encoding File Reader — Hard
Topics: encoding detection, latin-1, utf-8, fallback strategy

Write a function smart_read(path, encodings=None) that attempts to read a file using multiple encodings in order.

Default encodings: ["utf-8", "latin-1", "ascii"]. For each encoding, try reading with errors="strict". On success, return (content, encoding_used). If all fail, raise ValueError.

This tests understanding of encoding fallback strategies. Note the subtle behavior: Latin-1 accepts every possible byte, so it never fails — making encoding order critical.

Python
import tempfile, os

# Setup: create files with different encodings
utf8_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
utf8_file.write("Hello caf\u00e9 \u2014 world".encode("utf-8"))
utf8_file.close()

latin1_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
latin1_file.write("Hello caf\u00e9 world".encode("latin-1"))
latin1_file.close()

ascii_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
ascii_file.write(b"Hello plain world")
ascii_file.close()


def smart_read(path, encodings=None):
    """Read ``path`` with the first encoding in ``encodings`` that decodes it strictly.

    Args:
        path: File to read.
        encodings: Encoding names to try in order; defaults to
            ["utf-8", "latin-1", "ascii"].

    Returns:
        A ``(content, encoding_used)`` tuple.

    Raises:
        ValueError: if no listed encoding could decode the file.
    """
    if encodings is None:
        encodings = ["utf-8", "latin-1", "ascii"]
    for enc in encodings:
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                content = f.read()
            return (content, enc)
        except (UnicodeDecodeError, ValueError, LookupError):
            # LookupError is what open() raises for an unknown encoding name,
            # so a bad name is skipped like an encoding that fails to decode.
            continue
    raise ValueError("Could not decode file with any of the provided encodings")


# Test
content1, enc1 = smart_read(utf8_file.name)
print(enc1 + ":", repr(content1))

content2, enc2 = smart_read(latin1_file.name)
print(enc2 + ":", repr(content2))

content3, enc3 = smart_read(ascii_file.name)
print(enc3 + ":", repr(content3))

# Test with custom encoding order
content4, enc4 = smart_read(latin1_file.name, encodings=["ascii", "latin-1"])
print(enc4 + ":", repr(content4))

for f in [utf8_file.name, latin1_file.name, ascii_file.name]:
    os.unlink(f)
Solution
def smart_read(path, encodings=None):
    """Return ``(content, encoding_used)`` for the first encoding that decodes ``path``.

    Encodings are tried in order with errors="strict"; the default list is
    ["utf-8", "latin-1", "ascii"]. Raises ValueError if none succeeds.
    """
    if encodings is None:
        encodings = ["utf-8", "latin-1", "ascii"]
    for enc in encodings:
        try:
            with open(path, "r", encoding=enc, errors="strict") as f:
                content = f.read()
            return (content, enc)
        except (UnicodeDecodeError, ValueError, LookupError):
            # LookupError is what open() raises for an unknown encoding name,
            # so a bad name is skipped like an encoding that fails to decode.
            continue
    raise ValueError("Could not decode file with any of the provided encodings")

Key points:

  • Encoding order matters critically. Latin-1 (ISO-8859-1) maps every byte 0x00-0xFF to a character, so it never raises UnicodeDecodeError. If listed first, it would always "succeed" — even on UTF-8 files, producing garbled output.
  • UTF-8 should be tried first because it is the most common modern encoding and has strict validation (invalid byte sequences are rejected).
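  • The Latin-1 file cannot be read as UTF-8: its 0xE9 byte starts a multi-byte UTF-8 sequence that the following byte does not continue, so strict decoding raises UnicodeDecodeError and smart_read falls back to Latin-1. Other Latin-1 byte patterns can happen to form valid UTF-8 and silently decode to the wrong characters, which is why encoding detection is inherently imperfect.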
  • Production tools like chardet use statistical analysis of byte patterns to guess encodings with confidence scores.
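  • Catching ValueError alongside UnicodeDecodeError is redundant but harmless, because UnicodeDecodeError is a subclass of ValueError. An unknown encoding name raises LookupError instead, so LookupError must also be caught if invalid names should be skipped.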
import tempfile, os

# Setup: create files with different encodings
utf8_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
utf8_file.write("Hello café — world".encode("utf-8"))
utf8_file.close()

latin1_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
latin1_file.write("Hello café world".encode("latin-1"))
latin1_file.close()

ascii_file = tempfile.NamedTemporaryFile("wb", suffix=".txt", delete=False)
ascii_file.write(b"Hello plain world")
ascii_file.close()


def smart_read(path, encodings=None):
  """Exercise stub: should return (content, encoding_used) for the first encoding that works."""
  # TODO: Try to read the file using each encoding in the list, in order.
  # If encodings is None, default to ["utf-8", "latin-1", "ascii"].
  # For each encoding, attempt to read with errors="strict".
  # Return a tuple of (content_string, encoding_used).
  # If the encoding works, return immediately.
  # If all encodings fail, raise a ValueError with the message
  # "Could not decode file with any of the provided encodings".
  pass


# Test
content1, enc1 = smart_read(utf8_file.name)
print(enc1 + ":", repr(content1))

content2, enc2 = smart_read(latin1_file.name)
print(enc2 + ":", repr(content2))

content3, enc3 = smart_read(ascii_file.name)
print(enc3 + ":", repr(content3))

# Test with custom encoding order
content4, enc4 = smart_read(latin1_file.name, encodings=["ascii", "latin-1"])
print(enc4 + ":", repr(content4))

for f in [utf8_file.name, latin1_file.name, ascii_file.name]:
  os.unlink(f)
Expected Output
utf-8: 'Hello café — world'
latin-1: 'Hello café world'
utf-8: 'Hello plain world'
latin-1: 'Hello café world'
Hints

Hint 1: Use a for loop over the encodings list. Inside, try opening with errors='strict'.

Hint 2: Catch UnicodeDecodeError and continue to the next encoding.

Hint 3: If the read succeeds, return (content, encoding) immediately.

Hint 4: Note: Latin-1 never raises UnicodeDecodeError because every byte 0x00-0xFF is valid in Latin-1.

© 2026 EngineersOfAI. All rights reserved.