Extract text from PDFs with position data.
pip install pdf-strings
from pdf_strings import from_path
# Extract text from a PDF
output = from_path("document.pdf")
print(output) # Plain text
from_path: Extract text from a PDF file.
Parameters:
path(str): Path to the PDF file
password(str, optional): Password for encrypted PDFs
Returns: TextOutput object containing structured lines and spans
Example:
from pdf_strings import from_path
# Basic usage
output = from_path("document.pdf")
# With password
output = from_path("encrypted.pdf", password="secret")
from_bytes: Extract text from PDF bytes.
Parameters:
data(bytes): PDF file contents as bytes
password(str, optional): Password for encrypted PDFs
Returns: TextOutput object containing structured lines and spans
Example:
from pdf_strings import from_bytes
with open("document.pdf", "rb") as f:
    data = f.read()
output = from_bytes(data)
TextOutput: Container for extracted text with structured data.
Attributes:
lines(List[List[TextSpan]]): Lines of text, each containing multiple spans
Methods:
to_string(): Get plain text output (concatenates all text with spaces).
output = from_path("document.pdf")
plain_text = output.to_string()
# or simply:
plain_text = str(output)
to_string_pretty(): Get formatted text that preserves spatial layout using a character grid.
output = from_path("document.pdf")
formatted_text = output.to_string_pretty()
# or using format spec:
formatted_text = f"{output:#}"
Magic Methods:
__str__(): Returns plain text (same as to_string())
__format__(format_spec): Use # for pretty formatting: f"{output:#}"
TextSpan: A span of text with position and metadata.
Attributes:
text(str): The text content
bbox(BoundingBox): Bounding box coordinates
font_size(float): Font size in points
page(int): Page number (0-indexed)
Example:
output = from_path("document.pdf")
for line in output.lines:
    for span in line:
        print(f"'{span.text}' at size {span.font_size}pt on page {span.page}")
        print(f" Position: {span.bbox}")
BoundingBox: Bounding box coordinates for a text span.
Attributes:
top(float): Top coordinate
right(float): Right coordinate
bottom(float): Bottom coordinate
left(float): Left coordinate
String representation: (top, right, bottom, left), following the CSS margin shorthand order.
Example:
bbox = span.bbox
print(f"Top-left: ({bbox.left}, {bbox.top})")
print(f"Width: {bbox.right - bbox.left}")
print(f"Height: {bbox.top - bbox.bottom}")
from pdf_strings import from_path
output = from_path("document.pdf")
print(output.to_string())
from pdf_strings import from_path
output = from_path("invoice.pdf")
# Character grid rendering preserves columns and spacing
print(output.to_string_pretty())
from pdf_strings import from_path
output = from_path("document.pdf")
for line_idx, line in enumerate(output.lines):
    print(f"Line {line_idx}:")
    for span in line:
        print(f" {span.text}")
        print(f" Font size: {span.font_size}")
        print(f" Position: ({span.bbox.left}, {span.bbox.top})")
        print(f" Page: {span.page}")
from pdf_strings import from_path
output = from_path("document.pdf")
# Find text in the top-right corner
for line in output.lines:
    for span in line:
        if span.bbox.top < 100 and span.bbox.left > 400:
            print(f"Top-right text: {span.text}")
from pdf_strings import from_path
output = from_path("table.pdf")
# Group spans by their vertical position (rows)
rows = {}
for line in output.lines:
    for span in line:
        row_key = round(span.bbox.top / 10) * 10 # Group by ~10pt vertical bands
        if row_key not in rows:
            rows[row_key] = []
        rows[row_key].append((span.bbox.left, span.text))
# Print rows sorted by vertical position
for y_pos in sorted(rows.keys(), reverse=True):
    # Sort spans in each row by horizontal position
    row_spans = sorted(rows[y_pos], key=lambda x: x[0])
    print(" | ".join(text for _, text in row_spans))
- Plain text extraction
- Spatial layout preservation via character grid
- Bounding box coordinates for every text span
- Font size and page information
- Password-protected PDF support
- Handles complex fonts, rotated text, and multi-column layouts
- Works with all Python 3.11+ versions
License: MIT