# CSV and Tabular Files

## What You'll Learn

How to read and write CSV files reliably, handle edge cases like quotes and encoding, and process tabular data efficiently.
## Reading CSV Files

Python's built-in csv module handles quoting, escaping, and delimiters correctly:

```python
import csv

# Read into a list of dicts (usually the most useful form)
with open("data.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    rows = list(reader)

# rows is a list of dicts:
# [{"name": "Alice", "age": "30", "score": "95"}, ...]
print(rows[0]["name"])  # Alice
```
Why `newline=""`? The csv module handles line endings itself (it writes `\r\n` by default), so without `newline=""` Python's own newline translation interferes: files written on Windows gain extra blank lines, and embedded newlines inside quoted fields can be mangled on read.
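That quoting claim is easy to verify. A small round-trip sketch (the field values are invented for illustration): fields containing commas, quotes, or newlines are escaped on write and restored intact on read.

```python
import csv
import io

buf = io.StringIO()
csv.writer(buf).writerow(["Smith, Alice", 'said "hi"', "line1\nline2"])
print(buf.getvalue())
# "Smith, Alice","said ""hi""","line1
# line2"

buf.seek(0)
print(next(csv.reader(buf)))
# ['Smith, Alice', 'said "hi"', 'line1\nline2']
```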
### Reading as Lists (Without Headers)

```python
with open("data.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)  # first row is the header
    for row in reader:
        print(row)  # ['Alice', '30', '95']
```
### Handling Different Delimiters

```python
# Tab-separated values (TSV)
with open("data.tsv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter="\t")
    rows = list(reader)

# Semicolon-separated (common in European Excel exports)
with open("data.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, delimiter=";")
    rows = list(reader)
```
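When the delimiter isn't known in advance, the standard library's `csv.Sniffer` can usually detect it from a sample. A minimal sketch, assuming a UTF-8 file with a header row (the function name and candidate delimiters are my own choices):

```python
import csv

def read_any_delimiter(path: str) -> list[dict]:
    """Sniff the delimiter from a sample, then parse normally."""
    with open(path, newline="", encoding="utf-8") as f:
        sample = f.read(4096)  # a few KB is enough for sniffing
        dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
        f.seek(0)              # rewind so DictReader sees the header again
        return list(csv.DictReader(f, dialect=dialect))

rows = read_any_delimiter("data.csv")  # comma, semicolon, tab, or pipe
```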
## Writing CSV Files

```python
import csv

rows = [
    {"name": "Alice", "age": 30, "score": 95},
    {"name": "Bob", "age": 25, "score": 82},
]

fieldnames = ["name", "age", "score"]

with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()    # write the header row
    writer.writerows(rows)  # write all rows
```
### Writing Rows as Lists

```python
with open("output.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "age", "score"])               # header
    writer.writerow(["Alice", 30, 95])                      # one row
    writer.writerows([["Bob", 25, 82], ["Carol", 35, 88]])  # many rows
```
## Type Conversion: CSV Values Are Always Strings

CSV has no type system, so readers return every value as a string. Convert explicitly:

```python
import csv

with open("data.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    records = []
    for row in reader:
        records.append({
            "name": row["name"],
            "age": int(row["age"]),             # convert to int
            "score": float(row["score"]),       # convert to float
            "active": row["active"] == "true",  # convert to bool
        })
```
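Real exports often contain blanks or stray whitespace that make a bare `int()` or `float()` raise. A hedged helper sketch (the function name and default are my own, not part of the csv module):

```python
def to_float(value: str, default: float | None = None) -> float | None:
    """Parse a float, tolerating blanks, whitespace, and junk values."""
    value = value.strip()
    if not value:
        return default
    try:
        return float(value)
    except ValueError:
        return default

print(to_float(" 95.5 "))  # 95.5
print(to_float(""))        # None
print(to_float("N/A"))     # None
```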
## Filtering and Transforming CSV Data

```python
import csv

with open("students.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    rows = list(reader)

# Filter: keep only passing students
passing = [r for r in rows if float(r["score"]) >= 70]

# Sort by score, descending
sorted_rows = sorted(rows, key=lambda r: float(r["score"]), reverse=True)

# Add a computed column
for r in rows:
    score = float(r["score"])
    r["grade"] = "A" if score >= 90 else "B" if score >= 80 else "C" if score >= 70 else "F"

# Write the transformed data; extrasaction="ignore" silently drops
# any dict keys not listed in fieldnames instead of raising
with open("graded.csv", "w", newline="", encoding="utf-8") as f:
    fieldnames = ["name", "score", "grade"]
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
    writer.writeheader()
    writer.writerows(rows)
```
## Handling Large CSV Files (Memory Efficient)

For files with millions of rows, don't load everything into memory:

```python
import csv

def process_large_csv(input_path: str, output_path: str) -> dict:
    stats = {"read": 0, "written": 0, "skipped": 0}
    with open(input_path, newline="", encoding="utf-8") as fin, \
         open(output_path, "w", newline="", encoding="utf-8") as fout:
        reader = csv.DictReader(fin)
        fieldnames = reader.fieldnames + ["grade"]
        writer = csv.DictWriter(fout, fieldnames=fieldnames)
        writer.writeheader()
        for row in reader:  # one row at a time: O(1) memory
            stats["read"] += 1
            try:
                score = float(row["score"])
                row["grade"] = "A" if score >= 90 else "B" if score >= 80 else "C"
                writer.writerow(row)
                stats["written"] += 1
            except (ValueError, KeyError):
                stats["skipped"] += 1
    return stats
```
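A quick usage sketch (the file names are illustrative):

```python
stats = process_large_csv("scores.csv", "scores_graded.csv")
print(stats)  # e.g. {'read': 1000000, 'written': 999980, 'skipped': 20}
```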
## Detecting Encoding

Files from Windows often arrive as UTF-8 with a BOM, or in a legacy 8-bit encoding such as latin-1:

```python
import csv

# UTF-8 with BOM (typical of Excel on Windows); utf-8-sig strips the BOM
with open("data.csv", newline="", encoding="utf-8-sig") as f:
    reader = csv.DictReader(f)
    rows = list(reader)

# Latin-1 (the legacy default in many Windows locales)
with open("data.csv", newline="", encoding="latin-1") as f:
    reader = csv.DictReader(f)
    rows = list(reader)
```

When the encoding is unknown, try chardet:

```
pip install chardet
```

```python
import chardet

# Detect from a raw-byte sample; 10 KB is usually enough
with open("data.csv", "rb") as f:
    result = chardet.detect(f.read(10000))

encoding = result["encoding"]
print(f"Detected encoding: {encoding}")
```
## CSV with pandas (For Heavy Data Work)

For complex analysis, pandas is much more powerful:

```
pip install pandas
```

```python
import pandas as pd

# Read
df = pd.read_csv("data.csv")

# Filter
passing = df[df["score"] >= 70]

# Sort
sorted_df = df.sort_values("score", ascending=False)

# Add a column
df["grade"] = df["score"].apply(
    lambda s: "A" if s >= 90 else "B" if s >= 80 else "C"
)

# Summary statistics
print(df["score"].describe())

# Write
df.to_csv("output.csv", index=False)
```

Use the csv module for: scripts, simple tasks, no dependencies.
Use pandas for: analysis, large datasets, transformations.
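By default pandas loads the whole file into memory. For CSVs too large for that, `read_csv` can also stream the file in chunks; a minimal sketch (the chunk size and column name are illustrative):

```python
import pandas as pd

total = 0
for chunk in pd.read_csv("data.csv", chunksize=100_000):  # 100k rows per chunk
    total += (chunk["score"] >= 70).sum()                 # aggregate chunk by chunk
print(f"Passing rows: {total}")
```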
## Common Mistakes

| Mistake | Fix |
|---|---|
| Forgetting `newline=""` | Always pass `newline=""` when opening files for the csv module |
| Assuming values are numbers | Cast explicitly with `int()` or `float()` |
| Wrong delimiter (`,` vs `;` vs `\t`) | Specify `delimiter=` explicitly |
| Encoding errors | Try `utf-8-sig` for Excel exports, `latin-1` for old Windows files |
| Loading a huge file into memory | Iterate row by row instead of calling `list(reader)` |
## Quick Reference

```python
import csv

# Read as dicts
with open("f.csv", newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

# Write dicts
with open("f.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["a", "b"])
    w.writeheader()
    w.writerows(rows)

# TSV
csv.DictReader(f, delimiter="\t")

# Excel export (UTF-8 BOM)
open("f.csv", encoding="utf-8-sig")

# Large files: iterate, don't build a list()
for row in csv.DictReader(f):
    process(row)
```