Skip to content

Commit f38749a

Browse files
committed
Fix issues in text selection
- PR mozilla#13257 fixed a lot of issues but not all of them, and this patch aims to fix almost all of the remaining ones. - the idea in this new patch is to compare the position of a new glyph with the last position where a glyph was drawn; - spaces are not "drawn": a space just moves the cursor and is not added to the chunk; - this way a space followed by a cursor move can be treated as a single space, which helps to merge all spaces into one. - to distinguish real spaces from tracking ones, we used a factor of the space width (from the font) - it was a pretty good idea in general, but it fails with some fonts where the space is too big: - in Poppler, they use a factor of the font size: this is an excellent idea (<= 0.1 * fontSize implies a tracking space).
1 parent f5b79be commit f38749a

10 files changed

+269
-131
lines changed

src/core/evaluator.js

+146-126
Large diffs are not rendered by default.

src/display/text_layer.js

+3-2
Original file line numberDiff line numberDiff line change
@@ -185,10 +185,11 @@ function appendText(task, geom, styles, ctx) {
185185
let shouldScaleText = false;
186186
if (
187187
geom.str.length > 1 ||
188-
(task._enhanceTextSelection && AllWhitespaceRegexp.test(geom.str))
188+
(task._enhanceTextSelection &&
189+
(geom.isOneSpace || AllWhitespaceRegexp.test(geom.str)))
189190
) {
190191
shouldScaleText = true;
191-
} else if (geom.transform[0] !== geom.transform[3]) {
192+
} else if (!geom.isOneSpace && geom.transform[0] !== geom.transform[3]) {
192193
const absScaleX = Math.abs(geom.transform[0]),
193194
absScaleY = Math.abs(geom.transform[3]);
194195
// When the horizontal/vertical scaling differs significantly, also scale

test/pdfs/.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
!issue1155r.pdf
1414
!issue2017r.pdf
1515
!bug1727053.pdf
16+
!issue11913.pdf
1617
!issue2391-1.pdf
1718
!issue2391-2.pdf
1819
!issue14046.pdf
@@ -182,6 +183,7 @@
182183
!issue11931.pdf
183184
!issue1655r.pdf
184185
!issue6541.pdf
186+
!issue10640.pdf
185187
!issue2948.pdf
186188
!issue6231_1.pdf
187189
!issue10402.pdf
@@ -285,6 +287,7 @@
285287
!issue2840.pdf
286288
!issue4061.pdf
287289
!issue4668.pdf
290+
!issue13226.pdf
288291
!PDFJS-7562-reduced.pdf
289292
!issue11768_reduced.pdf
290293
!issue5039.pdf
@@ -440,6 +443,7 @@
440443
!annotation-fileattachment.pdf
441444
!annotation-text-widget.pdf
442445
!annotation-choice-widget.pdf
446+
!issue10900.pdf
443447
!annotation-button-widget.pdf
444448
!annotation-polyline-polygon.pdf
445449
!annotation-polyline-polygon-without-appearance.pdf
@@ -462,6 +466,7 @@
462466
!issue9972-3.pdf
463467
!tiling-pattern-box.pdf
464468
!tiling-pattern-large-steps.pdf
469+
!issue13201.pdf
465470
!issue11555.pdf
466471
!issue12337.pdf
467472
!pr12564.pdf

test/pdfs/issue10640.pdf

49.1 KB
Binary file not shown.

test/pdfs/issue10900.pdf

33.3 KB
Binary file not shown.

test/pdfs/issue11913.pdf

32.8 KB
Binary file not shown.

test/pdfs/issue13201.pdf

146 KB
Binary file not shown.

test/pdfs/issue13226.pdf

972 Bytes
Binary file not shown.

test/unit/api_spec.js

+114-2
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ describe("api", function () {
7373
}, WAIT_TIMEOUT);
7474
}
7575

76+
function mergeText(items) {
77+
return items.map(chunk => chunk.str + (chunk.hasEOL ? "\n" : "")).join("");
78+
}
79+
7680
describe("getDocument", function () {
7781
it("creates pdf doc from URL-string", async function () {
7882
const urlStr = TEST_PDFS_PATH + basicApiFileName;
@@ -1604,11 +1608,17 @@ describe("api", function () {
16041608
const data = await Promise.all([defaultPromise, parametersPromise]);
16051609

16061610
expect(!!data[0].items).toEqual(true);
1607-
expect(data[0].items.length).toEqual(12);
1611+
expect(data[0].items.length).toEqual(11);
16081612
expect(!!data[0].styles).toEqual(true);
16091613

1614+
const page1 = mergeText(data[0].items);
1615+
expect(page1).toEqual(`Table Of Content
1616+
Chapter 1 .......................................................... 2
1617+
Paragraph 1.1 ...................................................... 3
1618+
page 1 / 3`);
1619+
16101620
expect(!!data[1].items).toEqual(true);
1611-
expect(data[1].items.length).toEqual(7);
1621+
expect(data[1].items.length).toEqual(6);
16121622
expect(!!data[1].styles).toEqual(true);
16131623
});
16141624

@@ -1632,6 +1642,7 @@ describe("api", function () {
16321642
transform: [18, 0, 0, 18, 441.81, 708.4499999999999],
16331643
width: 77.49,
16341644
hasEOL: false,
1645+
isOneSpace: false,
16351646
});
16361647
expect(styles[fontName]).toEqual({
16371648
fontFamily: "serif",
@@ -1643,6 +1654,107 @@ describe("api", function () {
16431654
await loadingTask.destroy();
16441655
});
16451656

1657+
it("gets text content, with no extra spaces (issue 13226)", async function () {
1658+
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
1659+
const pdfDoc = await loadingTask.promise;
1660+
const pdfPage = await pdfDoc.getPage(1);
1661+
const { items } = await pdfPage.getTextContent();
1662+
const text = mergeText(items);
1663+
1664+
expect(text).toEqual(
1665+
"Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste"
1666+
);
1667+
1668+
await loadingTask.destroy();
1669+
});
1670+
1671+
it("gets text content, with merged spaces (issue 13201)", async function () {
1672+
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
1673+
const pdfDoc = await loadingTask.promise;
1674+
const pdfPage = await pdfDoc.getPage(1);
1675+
const { items } = await pdfPage.getTextContent();
1676+
const text = mergeText(items);
1677+
1678+
expect(
1679+
text.includes(
1680+
"Abstract. A purely peer-to-peer version of electronic cash would allow online"
1681+
)
1682+
).toEqual(true);
1683+
expect(
1684+
text.includes(
1685+
"avoid mediating disputes. The cost of mediation increases transaction costs, limiting the"
1686+
)
1687+
).toEqual(true);
1688+
expect(
1689+
text.includes(
1690+
"system is secure as long as honest nodes collectively control more CPU power than any"
1691+
)
1692+
).toEqual(true);
1693+
1694+
await loadingTask.destroy();
1695+
});
1696+
1697+
it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
1698+
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
1699+
const pdfDoc = await loadingTask.promise;
1700+
const pdfPage = await pdfDoc.getPage(1);
1701+
const { items } = await pdfPage.getTextContent();
1702+
const text = mergeText(items);
1703+
1704+
expect(
1705+
text.includes(
1706+
"1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the"
1707+
)
1708+
).toEqual(true);
1709+
expect(
1710+
text.includes(
1711+
"argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-"
1712+
)
1713+
).toEqual(true);
1714+
await loadingTask.destroy();
1715+
});
1716+
1717+
it("gets text content, with merged spaces (issue 10900)", async function () {
1718+
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
1719+
const pdfDoc = await loadingTask.promise;
1720+
const pdfPage = await pdfDoc.getPage(1);
1721+
const { items } = await pdfPage.getTextContent();
1722+
const text = mergeText(items);
1723+
1724+
expect(
1725+
text.includes(`3 3 3 3
1726+
851.5 854.9 839.3 837.5
1727+
633.6 727.8 789.9 796.2
1728+
1,485.1 1,582.7 1,629.2 1,633.7
1729+
114.2 121.7 125.3 130.7
1730+
13.0x 13.0x 13.0x 12.5x`)
1731+
).toEqual(true);
1732+
1733+
await loadingTask.destroy();
1734+
});
1735+
1736+
it("gets text content, with spaces (issue 10640)", async function () {
1737+
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
1738+
const pdfDoc = await loadingTask.promise;
1739+
const pdfPage = await pdfDoc.getPage(1);
1740+
const { items } = await pdfPage.getTextContent();
1741+
const text = mergeText(items);
1742+
1743+
expect(
1744+
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
1745+
Open Sans was designed with an upright stress, open forms and a neu-
1746+
tral, yet friendly appearance. It was optimized for print, web, and mobile
1747+
interfaces, and has excellent legibility characteristics in its letterforms (see
1748+
figure \x81 on the following page). This font is available from the Google Font
1749+
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
1750+
This package provides support for this font in LATEX. It includes Type \x81
1751+
versions of the fonts, converted for this package using FontForge from its
1752+
sources, for full support with Dvips.`)
1753+
).toEqual(true);
1754+
1755+
await loadingTask.destroy();
1756+
});
1757+
16461758
it("gets empty structure tree", async function () {
16471759
const tree = await page.getStructTree();
16481760

test/unit/pdf_find_controller_spec.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ describe("pdf_find_controller", function () {
268268
pageIndex: 0,
269269
matchIndex: 0,
270270
},
271-
pageMatches: [[19, 48, 66]],
271+
pageMatches: [[19, 46, 62]],
272272
pageMatchesLength: [[8, 8, 8]],
273273
});
274274
});

0 commit comments

Comments
 (0)