Skip to content

Commit cc3ee81

Browse files
authored
Merge pull request #173 from funstory-ai/pdf2zh-v2-rc
fix(high_level): implement null xref handling in PDF processing
2 parents 1f4dc8e + aa76146 commit cc3ee81

File tree

8 files changed

+51
-43
lines changed

8 files changed

+51
-43
lines changed

babeldoc/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.2.13"
1+
__version__ = "0.2.14"

babeldoc/const.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import subprocess
44
from pathlib import Path
55

6-
__version__ = "0.2.13"
6+
__version__ = "0.2.14"
77

88
CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"
99

babeldoc/document_il/backend/pdf_creater.py

+6-1
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,12 @@ def write(self, translation_config: TranslationConfig) -> TranslateResult:
730730
draw_op.append(b" Tj ET Q \n")
731731
for xobj in page.pdf_xobject:
732732
draw_op = xobj_draw_ops[xobj.xobj_id]
733-
pdf.update_stream(xobj.xref_id, draw_op.tobytes())
733+
try:
734+
pdf.update_stream(xobj.xref_id, draw_op.tobytes())
735+
except Exception:
736+
logger.warning(
737+
f"update xref {xobj.xref_id} stream fail, continue"
738+
)
734739
# pdf.update_stream(xobj.xref_id, b'')
735740
for rect in page.pdf_rectangle:
736741
self._debug_render_rectangle(page_op, rect)

babeldoc/high_level.py

+26-23
Original file line numberDiff line numberDiff line change
@@ -190,10 +190,10 @@ def start_parse_il(
190190
# box[y0:y1, x0:x1] = 0
191191
# layout[page.pageno] = box
192192
# 新建一个 xref 存放新指令流
193-
page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref
194-
doc_zh.update_object(page.page_xref, "<<>>")
195-
doc_zh.update_stream(page.page_xref, b"")
196-
doc_zh[page.pageno].set_contents(page.page_xref)
193+
# page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref
194+
# doc_zh.update_object(page.page_xref, "<<>>")
195+
# doc_zh.update_stream(page.page_xref, b"")
196+
# doc_zh[page.pageno].set_contents(page.page_xref)
197197
ops_base = interpreter.process_page(page)
198198
il_creater.on_page_base_operation(ops_base)
199199
il_creater.on_page_end()
@@ -353,6 +353,21 @@ def _monitor_memory_usage(self):
353353
time.sleep(self.interval)
354354

355355

356+
def fix_null_xref(doc: Document) -> None:
    """Replace null or unreadable xref entries in a PDF with empty arrays.

    Every xref from 1 up to ``doc.xref_length() - 1`` is inspected. An entry
    is rewritten to ``[]`` when its object source is the literal ``null`` or
    when reading it raises. The repair is best-effort: if rewriting an entry
    fails, a warning is logged and processing continues with the next xref
    instead of aborting the whole document.

    Args:
        doc: PyMuPDF Document object to fix (modified in place).
    """
    # Function-scope import keeps this block self-contained; the logger is
    # looked up by module name, so it is the module's regular logger.
    import logging

    log = logging.getLogger(__name__)

    for xref in range(1, doc.xref_length()):
        try:
            needs_fix = doc.xref_object(xref) == "null"
        except Exception:
            # An entry that cannot even be read is treated like a null one.
            needs_fix = True

        if not needs_fix:
            continue

        try:
            doc.update_object(xref, "[]")
        except Exception:
            # One unrepairable entry must not stop the remaining repairs.
            log.warning("try fix null xref %d fail, continue", xref)
369+
370+
356371
def do_translate(
357372
pm: ProgressMonitor, translation_config: TranslationConfig
358373
) -> TranslateResult:
@@ -507,12 +522,14 @@ def _do_translate_single(
507522
"""Original translation logic for a single document or part"""
508523
translation_config.progress_monitor = pm
509524
original_pdf_path = translation_config.input_file
510-
doc_input = Document(original_pdf_path)
511525
if translation_config.debug:
526+
doc_input = Document(original_pdf_path)
512527
logger.debug("debug mode, save decompressed input pdf")
513528
output_path = translation_config.get_working_file_path(
514529
"input.decompressed.pdf",
515530
)
531+
# Fix null xref in PDF file
532+
fix_null_xref(doc_input)
516533
doc_input.save(output_path, expand=True, pretty=True)
517534

518535
# Continue with original processing
@@ -521,29 +538,15 @@ def _do_translate_single(
521538
resfont = "china-ss"
522539

523540
# Fix null xref in PDF file
524-
for i in range(1, doc_pdf2zh.xref_length()):
525-
try:
526-
obj = doc_pdf2zh.xref_object(i)
527-
if obj == "null":
528-
ret = doc_pdf2zh.update_object(i, "[]")
529-
if ret != 0:
530-
logger.warning(f"try fix1 xref {i} fail, continue")
531-
else:
532-
logger.info(f"try fix1 xref {i} success")
533-
except Exception:
534-
ret = doc_pdf2zh.update_object(i, "[]")
535-
if ret != 0:
536-
logger.warning(f"try fix2 xref {i} fail, continue")
537-
else:
538-
logger.info(f"try fix2 xref {i} success")
541+
fix_null_xref(doc_pdf2zh)
539542

540543
for page in doc_pdf2zh:
541544
page.insert_font(resfont, None)
542545

543546
resfont = None
544547
doc_pdf2zh.save(temp_pdf_path)
545548
il_creater = ILCreater(translation_config)
546-
il_creater.mupdf = doc_input
549+
il_creater.mupdf = doc_pdf2zh
547550
xml_converter = XMLConverter()
548551
logger.debug(f"start parse il from {temp_pdf_path}")
549552
with Path(temp_pdf_path).open("rb") as f:
@@ -578,7 +581,7 @@ def _do_translate_single(
578581

579582
# Generate layouts for all pages
580583
logger.debug("start generating layouts")
581-
docs = LayoutParser(translation_config).process(docs, doc_input)
584+
docs = LayoutParser(translation_config).process(docs, doc_pdf2zh)
582585
logger.debug("finish generating layouts")
583586
if translation_config.debug:
584587
xml_converter.write_json(
@@ -634,7 +637,7 @@ def _do_translate_single(
634637

635638
if translation_config.watermark_output_mode == WatermarkOutputMode.Both:
636639
mono_watermark_first_page_doc_bytes, dual_watermark_first_page_doc_bytes = (
637-
generate_first_page_with_watermark(doc_input, translation_config, docs)
640+
generate_first_page_with_watermark(doc_pdf2zh, translation_config, docs)
638641
)
639642

640643
Typesetting(translation_config).typsetting_document(docs)

babeldoc/main.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from babeldoc.translation_config import WatermarkOutputMode
2323

2424
logger = logging.getLogger(__name__)
25-
__version__ = "0.2.13"
25+
__version__ = "0.2.14"
2626

2727

2828
def create_parser():

babeldoc/pdfinterp.py

+7-7
Original file line numberDiff line numberDiff line change
@@ -337,14 +337,14 @@ def process_page(self, page: PDFPage) -> None:
337337
ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
338338
self.device.fontid = self.fontid
339339
self.device.fontmap = self.fontmap
340-
ops_new = self.device.end_page(page)
340+
_ops_new = self.device.end_page(page)
341341
# 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来
342-
self.obj_patch[page.page_xref] = (
343-
# f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
344-
""
345-
)
346-
for obj in page.contents:
347-
self.obj_patch[obj.objid] = ""
342+
# self.obj_patch[page.page_xref] = (
343+
# # f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
344+
# ""
345+
# )
346+
# for obj in page.contents:
347+
# self.obj_patch[obj.objid] = ""
348348
return ops_base
349349

350350
def render_contents(

pyproject.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "BabelDOC"
3-
version = "0.2.13"
3+
version = "0.2.14"
44
description = "Yet Another Document Translator"
55
license = "AGPL-3.0"
66
readme = "README.md"
@@ -147,7 +147,7 @@ pythonpath = [".", "src"]
147147
testpaths = ["tests"]
148148

149149
[bumpver]
150-
current_version = "0.2.13"
150+
current_version = "0.2.14"
151151
version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
152152

153153
[bumpver.file_patterns]

uv.lock

+7-7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)