@@ -190,10 +190,10 @@ def start_parse_il(
190
190
# box[y0:y1, x0:x1] = 0
191
191
# layout[page.pageno] = box
192
192
# 新建一个 xref 存放新指令流
193
- page .page_xref = doc_zh .get_new_xref () # hack 插入页面的新 xref
194
- doc_zh .update_object (page .page_xref , "<<>>" )
195
- doc_zh .update_stream (page .page_xref , b"" )
196
- doc_zh [page .pageno ].set_contents (page .page_xref )
193
+ # page.page_xref = doc_zh.get_new_xref() # hack 插入页面的新 xref
194
+ # doc_zh.update_object(page.page_xref, "<<>>")
195
+ # doc_zh.update_stream(page.page_xref, b"")
196
+ # doc_zh[page.pageno].set_contents(page.page_xref)
197
197
ops_base = interpreter .process_page (page )
198
198
il_creater .on_page_base_operation (ops_base )
199
199
il_creater .on_page_end ()
@@ -353,6 +353,21 @@ def _monitor_memory_usage(self):
353
353
time .sleep (self .interval )
354
354
355
355
356
def fix_null_xref(doc: Document) -> None:
    """Replace null or unreadable xref objects in a PDF with empty arrays.

    Every xref number from 1 up to ``doc.xref_length() - 1`` is inspected.
    An entry is rewritten to ``[]`` when its object source is exactly
    ``"null"`` or when reading it raises. The repair is best-effort: if an
    entry cannot be rewritten, a warning is logged and the scan continues,
    so a single bad entry does not abort the whole pass.

    Args:
        doc: PyMuPDF Document object to fix. It is modified in place.
    """
    # Function-scope import keeps this helper self-contained.
    import logging

    log = logging.getLogger(__name__)

    for xref in range(1, doc.xref_length()):
        try:
            needs_fix = doc.xref_object(xref) == "null"
        except Exception:
            # The object could not even be read; treat it as broken.
            needs_fix = True

        if not needs_fix:
            continue

        try:
            doc.update_object(xref, "[]")
        except Exception:
            # Previously this exception escaped from the fallback call and
            # stopped the loop; keep going and report it instead.
            log.warning("try fix xref %s fail, continue", xref, exc_info=True)
369
+
370
+
356
371
def do_translate (
357
372
pm : ProgressMonitor , translation_config : TranslationConfig
358
373
) -> TranslateResult :
@@ -507,12 +522,14 @@ def _do_translate_single(
507
522
"""Original translation logic for a single document or part"""
508
523
translation_config .progress_monitor = pm
509
524
original_pdf_path = translation_config .input_file
510
- doc_input = Document (original_pdf_path )
511
525
if translation_config .debug :
526
+ doc_input = Document (original_pdf_path )
512
527
logger .debug ("debug mode, save decompressed input pdf" )
513
528
output_path = translation_config .get_working_file_path (
514
529
"input.decompressed.pdf" ,
515
530
)
531
+ # Fix null xref in PDF file
532
+ fix_null_xref (doc_input )
516
533
doc_input .save (output_path , expand = True , pretty = True )
517
534
518
535
# Continue with original processing
@@ -521,29 +538,15 @@ def _do_translate_single(
521
538
resfont = "china-ss"
522
539
523
540
# Fix null xref in PDF file
524
- for i in range (1 , doc_pdf2zh .xref_length ()):
525
- try :
526
- obj = doc_pdf2zh .xref_object (i )
527
- if obj == "null" :
528
- ret = doc_pdf2zh .update_object (i , "[]" )
529
- if ret != 0 :
530
- logger .warning (f"try fix1 xref { i } fail, continue" )
531
- else :
532
- logger .info (f"try fix1 xref { i } success" )
533
- except Exception :
534
- ret = doc_pdf2zh .update_object (i , "[]" )
535
- if ret != 0 :
536
- logger .warning (f"try fix2 xref { i } fail, continue" )
537
- else :
538
- logger .info (f"try fix2 xref { i } success" )
541
+ fix_null_xref (doc_pdf2zh )
539
542
540
543
for page in doc_pdf2zh :
541
544
page .insert_font (resfont , None )
542
545
543
546
resfont = None
544
547
doc_pdf2zh .save (temp_pdf_path )
545
548
il_creater = ILCreater (translation_config )
546
- il_creater .mupdf = doc_input
549
+ il_creater .mupdf = doc_pdf2zh
547
550
xml_converter = XMLConverter ()
548
551
logger .debug (f"start parse il from { temp_pdf_path } " )
549
552
with Path (temp_pdf_path ).open ("rb" ) as f :
@@ -578,7 +581,7 @@ def _do_translate_single(
578
581
579
582
# Generate layouts for all pages
580
583
logger .debug ("start generating layouts" )
581
- docs = LayoutParser (translation_config ).process (docs , doc_input )
584
+ docs = LayoutParser (translation_config ).process (docs , doc_pdf2zh )
582
585
logger .debug ("finish generating layouts" )
583
586
if translation_config .debug :
584
587
xml_converter .write_json (
@@ -634,7 +637,7 @@ def _do_translate_single(
634
637
635
638
if translation_config .watermark_output_mode == WatermarkOutputMode .Both :
636
639
mono_watermark_first_page_doc_bytes , dual_watermark_first_page_doc_bytes = (
637
- generate_first_page_with_watermark (doc_input , translation_config , docs )
640
+ generate_first_page_with_watermark (doc_pdf2zh , translation_config , docs )
638
641
)
639
642
640
643
Typesetting (translation_config ).typsetting_document (docs )
0 commit comments