Skip to content

Commit

Permalink
Clearer variables for image cropping (Azure-Samples#2298)
Browse files Browse the repository at this point in the history
  • Loading branch information
pamelafox authored and dfl-aeb committed Feb 19, 2025
1 parent 035de73 commit b06c209
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions app/backend/prepdocslib/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,25 +224,25 @@ def table_to_html(table: DocumentTable):

@staticmethod
def crop_image_from_pdf_page(
doc: pymupdf.Document, page_number: int, bounding_box: tuple[float, float, float, float]
doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float]
) -> bytes:
"""
Crops a region from a given page in a PDF and returns it as an image.
:param doc: The opened PDF document (pymupdf.Document) to crop from.
:param page_number: The page number to crop from (0-indexed).
:param bounding_box: A tuple of (x0, y0, x1, y1) coordinates for the bounding box.
:param bbox_inches: A tuple of (x0, y0, x1, y1) coordinates for the bounding box, in inches.
:return: The cropped area as image bytes.
"""
# Scale the bounding box to 72 DPI
bbox_dpi = 72
bbox_pixels = [x * bbox_dpi for x in bbox_inches]
rect = pymupdf.Rect(bbox_pixels)
# Assume that the PDF has 300 DPI,
# and use the matrix to convert between the 2 DPIs
page_dpi = 300
page = doc.load_page(page_number)

# Cropping the page. The rect requires the coordinates in the format (x0, y0, x1, y1).
bbx = [x * 72 for x in bounding_box]
rect = pymupdf.Rect(bbx)
# Bounding box is scaled to 72 dots per inch
# We assume the PDF has 300 DPI
# The matrix is used to convert between these 2 units
pix = page.get_pixmap(matrix=pymupdf.Matrix(300 / 72, 300 / 72), clip=rect)
pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect)

img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
bytes_io = io.BytesIO()
Expand Down

0 comments on commit b06c209

Please sign in to comment.