Skip to content

Commit de6282c

Browse files
committed
[text selection] Add the whitespaces present in the pdf in the text chunk
- it aims to fix issue #14627; - the basic idea of the recent text refactoring was to only consider the rendered visible whitespaces. But sometimes the heuristics aren't correct: although some whitespaces are in the text stream, they weren't in the text chunks because they were too small. Hence we added some exceptions; for example, we always add a whitespace when it is between two non-whitespace chars, but only when they are in the same Tj. So basically, this patch removes the constraint that the chars be in the same Tj (by using a cyclic buffer to save the two last chars) but doesn't add a space when the visible space is really too small (hence `NOT_A_SPACE_FACTOR`).
1 parent db4f3ad commit de6282c

File tree

6 files changed

+85
-23
lines changed

6 files changed

+85
-23
lines changed

src/core/evaluator.js

+69-14
Original file line numberDiff line numberDiff line change
@@ -2187,17 +2187,50 @@ class PartialEvaluator {
21872187
spaceInFlowMax: 0,
21882188
trackingSpaceMin: Infinity,
21892189
negativeSpaceMax: -Infinity,
2190+
notASpace: -Infinity,
21902191
transform: null,
21912192
fontName: null,
21922193
hasEOL: false,
21932194
};
21942195

2196+
// Use a cyclic buffer (length === 2) to save the last chars in the
2197+
// text stream.
2198+
// It's useful to know when we need to add a whitespace in the
2199+
// text chunk.
2200+
const twoLastChars = [" ", " "];
2201+
let twoLastCharsPos = 0;
2202+
2203+
/**
2204+
* Save the last char.
2205+
* @param {string} char
2206+
* @returns {boolean} true when the two last chars before adding the new one
2207+
* are a non-whitespace followed by a whitespace.
2208+
*/
2209+
function saveLastChar(char) { // Records `char` in the 2-slot buffer and reports whether a space should be inserted before it.
2210+
const nextPos = (twoLastCharsPos + 1) % 2; // index of the other slot of the 2-element buffer
2211+
const ret = // true when the older saved char is a non-space and the most recent one is a space
2212+
twoLastChars[twoLastCharsPos] !== " " && twoLastChars[nextPos] === " "; // [twoLastCharsPos] is the older char, [nextPos] the most recent
2213+
twoLastChars[twoLastCharsPos] = char; // overwrite the older char with the new one
2214+
twoLastCharsPos = nextPos; // the next call will overwrite the other slot
2215+
2216+
return ret; // computed before the buffer was updated
2217+
}
2218+
2219+
function resetLastChars() { // Forgets the saved chars so that the next saveLastChar call returns false.
2220+
twoLastChars[0] = twoLastChars[1] = " "; // both slots hold a space, same as the initial state
2221+
twoLastCharsPos = 0; // restart writing at the first slot
2222+
}
2223+
21952224
// Used in addFakeSpaces.
21962225

21972226
// A white gap <= fontSize * TRACKING_SPACE_FACTOR is a tracking space
21982227
// so it doesn't count as a space.
21992228
const TRACKING_SPACE_FACTOR = 0.1;
22002229

2230+
// When a white gap is <= fontSize * NOT_A_SPACE_FACTOR, there is no space
2231+
// even if one is present in the text stream.
2232+
const NOT_A_SPACE_FACTOR = 0.03;
2233+
22012234
// A negative white < fontSize * NEGATIVE_SPACE_FACTOR induces
22022235
// a break (a new chunk of text is created).
22032236
// It doesn't change anything when the text is copied but
@@ -2299,6 +2332,7 @@ class PartialEvaluator {
22992332

23002333
textContentItem.trackingSpaceMin =
23012334
textState.fontSize * TRACKING_SPACE_FACTOR;
2335+
textContentItem.notASpace = textState.fontSize * NOT_A_SPACE_FACTOR;
23022336
textContentItem.negativeSpaceMax =
23032337
textState.fontSize * NEGATIVE_SPACE_FACTOR;
23042338
textContentItem.spaceInFlowMin =
@@ -2483,6 +2517,7 @@ class PartialEvaluator {
24832517
return true;
24842518
}
24852519

2520+
resetLastChars();
24862521
flushTextContentItem();
24872522
return true;
24882523
}
@@ -2491,7 +2526,17 @@ class PartialEvaluator {
24912526
appendEOL();
24922527
return true;
24932528
}
2529+
2530+
if (advanceY <= textOrientation * textContentItem.notASpace) {
2531+
// The real spacing between 2 consecutive chars is thin enough to be
2532+
// considered a non-space.
2533+
resetLastChars();
2534+
}
2535+
24942536
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
2537+
if (advanceY <= textContentItem.notASpace) {
2538+
resetLastChars();
2539+
}
24952540
textContentItem.height += advanceY;
24962541
} else if (
24972542
!addFakeSpaces(
@@ -2501,6 +2546,7 @@ class PartialEvaluator {
25012546
)
25022547
) {
25032548
if (textContentItem.str.length === 0) {
2549+
resetLastChars();
25042550
textContent.items.push({
25052551
str: " ",
25062552
dir: "ltr",
@@ -2532,6 +2578,10 @@ class PartialEvaluator {
25322578
appendEOL();
25332579
return true;
25342580
}
2581+
2582+
// We're moving back so in case the last char was a whitespace
2583+
// we cancel it: it doesn't make sense to insert it.
2584+
resetLastChars();
25352585
flushTextContentItem();
25362586
return true;
25372587
}
@@ -2541,12 +2591,19 @@ class PartialEvaluator {
25412591
return true;
25422592
}
25432593

2594+
if (advanceX <= textOrientation * textContentItem.notASpace) {
2595+
// The real spacing between 2 consecutive chars is thin enough to be
2596+
// considered a non-space.
2597+
resetLastChars();
2598+
}
2599+
25442600
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
25452601
textContentItem.width += advanceX;
25462602
} else if (
25472603
!addFakeSpaces(advanceX, textContentItem.prevTransform, textOrientation)
25482604
) {
25492605
if (textContentItem.str.length === 0) {
2606+
resetLastChars();
25502607
textContent.items.push({
25512608
str: " ",
25522609
dir: "ltr",
@@ -2600,14 +2657,7 @@ class PartialEvaluator {
26002657
}
26012658
let scaledDim = glyphWidth * scale;
26022659

2603-
if (
2604-
glyph.isWhitespace &&
2605-
(i === 0 ||
2606-
i + 1 === ii ||
2607-
glyphs[i - 1].isWhitespace ||
2608-
glyphs[i + 1].isWhitespace ||
2609-
extraSpacing)
2610-
) {
2660+
if (glyph.isWhitespace) {
26112661
// Don't push a " " in the textContentItem
26122662
// (except when it's between two non-spaces chars),
26132663
// it will be done (if required) in next call to
@@ -2623,6 +2673,7 @@ class PartialEvaluator {
26232673
charSpacing += -scaledDim + textState.wordSpacing;
26242674
textState.translateTextMatrix(0, -charSpacing);
26252675
}
2676+
saveLastChar(" ");
26262677
continue;
26272678
}
26282679

@@ -2653,17 +2704,18 @@ class PartialEvaluator {
26532704
textChunk.prevTransform = getCurrentTextTransform();
26542705
}
26552706

2656-
if (glyph.isWhitespace) {
2707+
let glyphUnicode = glyph.unicode;
2708+
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
2709+
glyphUnicode = reverseIfRtl(glyphUnicode);
2710+
if (saveLastChar(glyphUnicode)) {
2711+
// The two last chars are a non-whitespace followed by a whitespace
2712+
// and then this non-whitespace, so we insert a whitespace here.
26572713
// Replaces all whitespaces with standard spaces (0x20), to avoid
26582714
// alignment issues between the textLayer and the canvas if the text
26592715
// contains e.g. tabs (fixes issue6612.pdf).
26602716
textChunk.str.push(" ");
2661-
} else {
2662-
let glyphUnicode = glyph.unicode;
2663-
glyphUnicode = NormalizedUnicodes[glyphUnicode] || glyphUnicode;
2664-
glyphUnicode = reverseIfRtl(glyphUnicode);
2665-
textChunk.str.push(glyphUnicode);
26662717
}
2718+
textChunk.str.push(glyphUnicode);
26672719

26682720
if (charSpacing) {
26692721
if (!font.vertical) {
@@ -2679,6 +2731,7 @@ class PartialEvaluator {
26792731
}
26802732

26812733
function appendEOL() {
2734+
resetLastChars();
26822735
if (textContentItem.initialized) {
26832736
textContentItem.hasEOL = true;
26842737
flushTextContentItem();
@@ -2701,6 +2754,7 @@ class PartialEvaluator {
27012754
width <= textOrientation * textContentItem.spaceInFlowMax
27022755
) {
27032756
if (textContentItem.initialized) {
2757+
resetLastChars();
27042758
textContentItem.str.push(" ");
27052759
}
27062760
return false;
@@ -2715,6 +2769,7 @@ class PartialEvaluator {
27152769
}
27162770

27172771
flushTextContentItem();
2772+
resetLastChars();
27182773
textContent.items.push({
27192774
str: " ",
27202775
// TODO: check if using the orientation from last chunk is

test/pdfs/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -515,3 +515,4 @@
515515
!issue14497.pdf
516516
!issue14502.pdf
517517
!issue13211.pdf
518+
!issue14627.pdf

test/pdfs/issue14627.pdf

61 KB
Binary file not shown.

test/test_manifest.json

+6
Original file line numberDiff line numberDiff line change
@@ -6329,5 +6329,11 @@
63296329
"md5": "d193853e8a123dc50eeea593a4150b60",
63306330
"rounds": 1,
63316331
"type": "eq"
6332+
},
6333+
{ "id": "issue14627",
6334+
"file": "pdfs/issue14627.pdf",
6335+
"md5": "5d1bfcc3b3130bfa7e33e43990e2213a",
6336+
"rounds": 1,
6337+
"type": "text"
63326338
}
63336339
]

test/unit/api_spec.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -1999,7 +1999,7 @@ describe("api", function () {
19991999
const data = await Promise.all([defaultPromise, parametersPromise]);
20002000

20012001
expect(!!data[0].items).toEqual(true);
2002-
expect(data[0].items.length).toEqual(11);
2002+
expect(data[0].items.length).toEqual(15);
20032003
expect(!!data[0].styles).toEqual(true);
20042004

20052005
const page1 = mergeText(data[0].items);

test/unit/pdf_find_controller_spec.js

+8-8
Original file line numberDiff line numberDiff line change
@@ -579,14 +579,14 @@ describe("pdf_find_controller", function () {
579579
},
580580
pageMatches: [
581581
[
582-
299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
583-
1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
584-
1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
585-
2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
586-
2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
587-
2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
588-
3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
589-
4071,
582+
302, 340, 418, 480, 627, 801, 982, 988, 1014, 1062, 1083, 1148, 1156,
583+
1277, 1345, 1393, 1401, 1423, 1499, 1523, 1529, 1685, 1775, 1787,
584+
1858, 1880, 1910, 1947, 2065, 2075, 2162, 2179, 2214, 2228, 2273,
585+
2323, 2359, 2401, 2412, 2423, 2462, 2531, 2537, 2552, 2561, 2575,
586+
2601, 2612, 2637, 2667, 2791, 2804, 2835, 2846, 2857, 2894, 2900,
587+
2914, 2938, 2958, 3088, 3235, 3245, 3335, 3383, 3390, 3464, 3473,
588+
3481, 3498, 3686, 3692, 3707, 3754, 3785, 3861, 3973, 4048, 4054,
589+
4067,
590590
],
591591
],
592592
pageMatchesLength: [

0 commit comments

Comments
 (0)