Skip to content

Commit a294d38

Browse files
committed
[api-minor] Don't add out-of-page chars to the text content (bug 1755201)
- It aims to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1755201; - if the glyph position is not within the cropBox, then the glyph is skipped.
1 parent 7824671 commit a294d38

File tree

5 files changed

+57
-13
lines changed

5 files changed

+57
-13
lines changed

src/core/document.js

+1
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,7 @@ class Page {
471471
includeMarkedContent,
472472
combineTextItems,
473473
sink,
474+
viewBox: this.view,
474475
});
475476
});
476477
}

src/core/evaluator.js

+33-13
Original file line numberDiff line numberDiff line change
@@ -2167,6 +2167,7 @@ class PartialEvaluator {
21672167
includeMarkedContent = false,
21682168
sink,
21692169
seenStyles = new Set(),
2170+
viewBox,
21702171
}) {
21712172
// Ensure that `resources`/`stateManager` is correctly initialized,
21722173
// even if the provided parameter is e.g. `null`.
@@ -2393,22 +2394,35 @@ class PartialEvaluator {
23932394
}
23942395

23952396
function compareWithLastPosition() {
2397+
const currentTransform = getCurrentTextTransform();
2398+
let posX = currentTransform[4];
2399+
let posY = currentTransform[5];
2400+
2401+
const shiftedX = posX - viewBox[0];
2402+
const shiftedY = posY - viewBox[1];
2403+
2404+
if (
2405+
shiftedX < 0 ||
2406+
shiftedX > viewBox[2] ||
2407+
shiftedY < 0 ||
2408+
shiftedY > viewBox[3]
2409+
) {
2410+
return false;
2411+
}
2412+
23962413
if (
23972414
!combineTextItems ||
23982415
!textState.font ||
23992416
!textContentItem.prevTransform
24002417
) {
2401-
return;
2418+
return true;
24022419
}
24032420

2404-
const currentTransform = getCurrentTextTransform();
2405-
let posX = currentTransform[4];
2406-
let posY = currentTransform[5];
24072421
let lastPosX = textContentItem.prevTransform[4];
24082422
let lastPosY = textContentItem.prevTransform[5];
24092423

24102424
if (lastPosX === posX && lastPosY === posY) {
2411-
return;
2425+
return true;
24122426
}
24132427

24142428
let rotate = -1;
@@ -2473,16 +2487,16 @@ class PartialEvaluator {
24732487
0.5 * textContentItem.width /* not the same column */
24742488
) {
24752489
appendEOL();
2476-
return;
2490+
return true;
24772491
}
24782492

24792493
flushTextContentItem();
2480-
return;
2494+
return true;
24812495
}
24822496

24832497
if (Math.abs(advanceX) > textContentItem.width) {
24842498
appendEOL();
2485-
return;
2499+
return true;
24862500
}
24872501
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
24882502
textContentItem.height += advanceY;
@@ -2508,7 +2522,7 @@ class PartialEvaluator {
25082522
}
25092523
}
25102524

2511-
return;
2525+
return true;
25122526
}
25132527

25142528
const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale;
@@ -2523,15 +2537,15 @@ class PartialEvaluator {
25232537
0.5 * textContentItem.height /* not the same line */
25242538
) {
25252539
appendEOL();
2526-
return;
2540+
return true;
25272541
}
25282542
flushTextContentItem();
2529-
return;
2543+
return true;
25302544
}
25312545

25322546
if (Math.abs(advanceY) > textContentItem.height) {
25332547
appendEOL();
2534-
return;
2548+
return true;
25352549
}
25362550

25372551
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
@@ -2553,6 +2567,8 @@ class PartialEvaluator {
25532567
textContentItem.width += advanceX;
25542568
}
25552569
}
2570+
2571+
return true;
25562572
}
25572573

25582574
function buildTextContentItem({ chars, extraSpacing }) {
@@ -2617,7 +2633,10 @@ class PartialEvaluator {
26172633
continue;
26182634
}
26192635

2620-
compareWithLastPosition();
2636+
if (!compareWithLastPosition()) {
2637+
// The glyph is not in page so just skip it.
2638+
continue;
2639+
}
26212640

26222641
// Must be called after compareWithLastPosition because
26232642
// the textContentItem could have been flushed.
@@ -3026,6 +3045,7 @@ class PartialEvaluator {
30263045
includeMarkedContent,
30273046
sink: sinkWrapper,
30283047
seenStyles,
3048+
viewBox,
30293049
})
30303050
.then(function () {
30313051
if (!sinkWrapper.enqueueInvoked) {

test/pdfs/bug1755201.pdf.link

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
https://bugzilla.mozilla.org/attachment.cgi?id=9263657

test/test_manifest.json

+6
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
[
2+
{ "id": "bug1755201",
3+
"file": "pdfs/bug1755201.pdf",
4+
"md5": "cece14097812d8a1f69e86a51e4a3804",
5+
"rounds": 1,
6+
"type": "other"
7+
},
28
{ "id": "filled-background-range",
39
"file": "pdfs/filled-background.pdf",
410
"md5": "2e3120255d9c3e79b96d2543b12d2589",

test/unit/api_spec.js

+16
Original file line numberDiff line numberDiff line change
@@ -2219,6 +2219,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
22192219
await loadingTask.destroy();
22202220
});
22212221

2222+
it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () {
2223+
if (isNodeJS) {
2224+
pending("Linked test-cases are not supported in Node.js.");
2225+
}
2226+
2227+
const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
2228+
const pdfDoc = await loadingTask.promise;
2229+
const pdfPage = await pdfDoc.getPage(6);
2230+
const { items } = await pdfPage.getTextContent();
2231+
const text = mergeText(items);
2232+
2233+
expect(/win aisle/.test(text)).toEqual(false);
2234+
2235+
await loadingTask.destroy();
2236+
});
2237+
22222238
it("gets empty structure tree", async function () {
22232239
const tree = await page.getStructTree();
22242240

0 commit comments

Comments
 (0)