Skip to content

Commit 309fbfc

Browse files
authored
fix doc parser (#4975)
1 parent 46eb86c commit 309fbfc

File tree

1 file changed

+9
-6
lines changed

1 file changed

+9
-6
lines changed

paddlenlp/utils/doc_parser.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import numpy as np
2424
import requests
2525
from packaging.version import Version
26-
from PIL import Image, ImageDraw
26+
from PIL import Image, ImageDraw, ImageOps
2727

2828
from .image_utils import np2base64
2929
from .log import logger
@@ -187,7 +187,8 @@ def read_image(self, image):
187187
"""
188188
image_buff = self._get_buffer(image)
189189

190-
_image = np.array(Image.open(BytesIO(image_buff)).convert("RGB"))
190+
# Use exif_transpose to correct orientation
191+
_image = np.array(ImageOps.exif_transpose(Image.open(BytesIO(image_buff)).convert("RGB")))
191192
return _image
192193

193194
@classmethod
@@ -216,16 +217,18 @@ def read_pdf(self, pdf, password=None):
216217
logger.warning("Currently only parse the first page for PDF input with more than one page.")
217218

218219
page = pdf_doc.load_page(0)
219-
image = np.array(self.get_page_image(page).convert("RGB"))
220+
# The original image is shrunk when converted from PDF by fitz, so we scale the image size by 10 times
221+
matrix = fitz.Matrix(10, 10)
222+
image = np.array(self.get_page_image(page, matrix).convert("RGB"))
220223
return image
221224

222225
@classmethod
223-
def get_page_image(self, page):
226+
def get_page_image(self, page, matrix):
224227
"""
225228
get page image
226229
"""
227-
pix = page.get_pixmap()
228-
image_buff = pix.pil_tobytes("jpeg", optimize=True)
230+
pix = page.get_pixmap(matrix=matrix)
231+
image_buff = pix.pil_tobytes("jpeg")
229232
return Image.open(BytesIO(image_buff))
230233

231234
def init_ocr_inference(self):

0 commit comments

Comments
 (0)