Skip to content

Commit ee75402

Browse files
author
speedplane
committed
Improve copy/paste by inserting spaces into textChunks if we deem it appropriate.
Add test re same. PR mozilla#5783.
1 parent fa0f09b commit ee75402

File tree

6 files changed

+153
-0
lines changed

6 files changed

+153
-0
lines changed

src/core/evaluator.js

+69
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,9 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
893893
styles: Object.create(null)
894894
};
895895
var bidiTexts = textContent.items;
896+
// At the end of each textChunk, auto insert spaces based on:
897+
var SPACE_FACTOR_CHUNKS = 0.6;
898+
// If performing a spacedText operation, auto insert spaces based on:
896899
var SPACE_FACTOR = 0.35;
897900
var MULTI_SPACE_FACTOR = 1.5;
898901

@@ -1037,8 +1040,73 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
10371040
} else {
10381041
textChunk.height += Math.abs(height * scaleCtmX * scaleLineX);
10391042
}
1043+
addSpaceIfNecessary(textChunk, font);
10401044
return textChunk;
10411045
}
1046+
/**
 * Computes the reference point used to compare the placement of two text
 * chunks.
 *
 * @param {Object} chunk - Text chunk; only its six-element `transform`
 *   array is read.
 * @param {Object} font - Font; `vertical`, `ascent` and `descent` are read.
 * @returns {{x: number, y: number}} `x` is `transform[4]`, shifted by the
 *   scaled ascent times the sine of the rotation when the rotation is
 *   non-zero; `y` is `transform[5]` unchanged.
 */
function getChunkPosition(chunk, font) {
  var matrix = chunk.transform;

  // Rotation of the chunk, derived from the first column of the matrix.
  var rotation = Math.atan2(matrix[1], matrix[0]);
  if (font.vertical) {
    rotation += Math.PI / 2;
  }

  // Length of the second matrix column, i.e. the unscaled glyph height.
  var ascentLength = Math.sqrt((matrix[2] * matrix[2]) +
                               (matrix[3] * matrix[3]));
  // Scale by the ascent when the font has a (truthy) one; otherwise fall
  // back to the descent.
  if (font.ascent) {
    ascentLength = font.ascent * ascentLength;
  } else if (font.descent) {
    ascentLength = (1 + font.descent) * ascentLength;
  }

  // Unrotated text keeps the raw horizontal translation.
  var left = matrix[4];
  if (rotation !== 0) {
    left = matrix[4] + (ascentLength * Math.sin(rotation));
  }

  return {
    x: left,
    y: matrix[5]
  };
}
1065+
/**
 * Appends a single space to the most recently collected chunk (the last
 * entry of `bidiTexts`) when `newChunk` appears to start a new word, so
 * that joining the chunk strings yields readable text.
 *
 * The previous chunk's `str` is modified in place; `newChunk` is never
 * changed. The positions of both chunks are computed with the `font` that
 * is passed in.
 *
 * @param {Object} newChunk - Chunk about to be added; `str`, `transform`,
 *   `width` and `height` are read.
 * @param {Object} font - Font forwarded to `getChunkPosition`.
 */
function addSpaceIfNecessary(newChunk, font) {
  // A leading space or hyphen on the new chunk already separates it.
  var firstChar = newChunk.str[0];
  if (firstChar === ' ' || firstChar === '-') {
    return;
  }

  // Nothing collected yet, so there is no chunk to append a space to.
  var chunkCount = bidiTexts.length;
  if (chunkCount === 0) {
    return;
  }

  // An empty previous chunk, or one that already ends with a space or a
  // hyphen, is left untouched.
  var lastChunk = bidiTexts[chunkCount - 1];
  var lastStr = lastChunk.str;
  if (lastStr.length === 0) {
    return;
  }
  var trailingChar = lastStr[lastStr.length - 1];
  if (trailingChar === ' ' || trailingChar === '-') {
    return;
  }

  var lastPosition = getChunkPosition(lastChunk, font);
  var newPosition = getChunkPosition(newChunk, font);

  // Chunks whose vertical offset reaches either chunk's height are treated
  // as being on different lines and are always separated.
  var yDiff = Math.abs(lastPosition.y - newPosition.y);
  var onDifferentLines = yDiff >= lastChunk.height ||
                         yDiff >= newChunk.height;
  if (onDifferentLines) {
    lastChunk.str += ' ';
    return;
  }

  // Same line: prefer the text state's word spacing; when it is zero,
  // estimate one from the new chunk's average character width.
  var gap = textState.wordSpacing;
  if (gap === 0) {
    gap = newChunk.width / newChunk.str.length * SPACE_FACTOR_CHUNKS;
  }

  var isLeftToRight = newPosition.x >= lastPosition.x;
  var isSeparated = isLeftToRight ?
    // Left to right: the new chunk begins past the end of the previous one.
    newPosition.x >= lastPosition.x + lastChunk.width + gap :
    // Right to left: the previous chunk begins past the end of the new one.
    lastPosition.x >= newPosition.x + newChunk.width + gap;

  if (isSeparated) {
    lastChunk.str += ' ';
  }
}
10421110

10431111
var timeSlotManager = new TimeSlotManager();
10441112

@@ -1121,6 +1189,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
11211189
textState.translateTextMatrix(0, offset);
11221190
textChunk.height += offset;
11231191
}
1192+
// Automatically insert spaces if the shift is big enough.
11241193
if (items[j] < 0 && textState.font.spaceWidth > 0) {
11251194
var fakeSpaces = -items[j] / textState.font.spaceWidth;
11261195
if (fakeSpaces > MULTI_SPACE_FACTOR) {

test/pdfs/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,4 @@
118118
!issue5481.pdf
119119
!issue5567.pdf
120120
!issue5701.pdf
121+
!US6205527_page1.pdf

test/pdfs/US6205527_page1.pdf

521 KB
Binary file not shown.

test/unit/api_spec.js

+4
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,10 @@ describe('api', function() {
228228
expect(!!data.items).toEqual(true);
229229
expect(data.items.length).toEqual(7);
230230
expect(!!data.styles).toEqual(true);
231+
232+
// Make sure the text is ordered properly.
233+
expect(data.items[1].str).toEqual('Table Of Content ');
234+
expect(data.items[6].str.replace(/^\s+/,'')).toEqual('page 1 / 3');
231235
});
232236
});
233237
it('gets operator list', function() {

test/unit/text_extract.js

+78
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2+
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
3+
/* globals PDFJS, expect, it, describe, Promise, combineUrl, waitsFor,
4+
isArray, MissingPDFException */
5+
6+
'use strict';
7+
8+
/**
 * Holds the current spec (via `waitsFor`) until `promise` settles.
 *
 * On fulfilment `successCallback` is invoked with the fulfilment value. On
 * rejection a failing expectation is recorded. In both cases the wait is
 * released as soon as the promise settles.
 *
 * @param {Promise} promise - The promise to wait for.
 * @param {function} successCallback - Called with the fulfilment value.
 */
function waitsForPromiseResolved(promise, successCallback) {
  // Settlement is tracked with its own flag instead of by inspecting the
  // value: a promise fulfilled with `undefined`, or a rejected one, would
  // otherwise never release the wait and the spec would stall until the
  // timeout below expires.
  var settled = false;
  promise.then(function(val) {
    settled = true;
    successCallback(val);
  },
  function() {
    settled = true;
    // Shouldn't get here.
    expect(false).toEqual(true);
  });
  waitsFor(function() {
    return settled;
  }, 20000);
}
22+
23+
/**
 * Starts loading the first page of `pdf` and makes the current spec wait
 * for it.
 *
 * @param {string} pdf - URL of the PDF, resolved against
 *   `window.location.href`.
 * @returns {{promise: Promise, page: *}} `promise` settles with the result
 *   of `doc.getPage(1)`; `page` is `undefined` until that promise is
 *   fulfilled and then holds the same value.
 */
function getPageOneOf(pdf) {
  var pdfURL = combineUrl(window.location.href, pdf);
  // Chain the promises directly instead of resolving a hand-made promise
  // from inside the callbacks, so that a failure in getDocument() or
  // getPage() rejects `pagePromise` rather than leaving it pending forever.
  var pagePromise = PDFJS.getDocument(pdfURL).then(function(doc) {
    return doc.getPage(1);
  });
  var page = {
    promise: pagePromise,
    // Filled in below once the page has loaded.
    page: undefined
  };
  waitsForPromiseResolved(pagePromise, function(data) {
    page.page = data;
  });
  return page;
}
43+
44+
// Checks that text extracted from the first page of the sample PDFs reads
// in the expected order, including the spaces between adjacent chunks.
describe('text-extract', function() {
  /**
   * Loads the first page of `pdf`, concatenates the `str` of every text
   * item and expects each of `snippets` to occur in the result.
   *
   * @param {string} pdf - URL of the PDF, as accepted by getPageOneOf().
   * @param {Array<string>} snippets - Substrings that must be present.
   */
  function expectFirstPageTextToContain(pdf, snippets) {
    var page = getPageOneOf(pdf);
    waitsForPromiseResolved(page.promise, function (pdfPage) {
      var textPromise = pdfPage.getTextContent();
      waitsForPromiseResolved(textPromise, function (textContent) {
        expect(!!textContent.items).toEqual(true);
        var text = textContent.items.map(function (item) {
          return item.str;
        }).join('');
        // Make sure the text is ordered properly. indexOf() returns -1
        // when the snippet is absent; offset 0 is a valid match.
        snippets.forEach(function (snippet) {
          expect(text.indexOf(snippet) !== -1).toEqual(true);
        });
      });
    });
  }

  it('patent', function () {
    // NOTE(review): the stray period in 'a storage. media' presumably
    // mirrors the text stored in the PDF - confirm against the file.
    expectFirstPageTextToContain('../pdfs/US6205527_page1.pdf', [
      'Disclosed is an apparatus, a system, a',
      'device to the computer system; (b) preparing ' +
        'a storage. media of the peripheral storage'
    ]);
  });

  it('tracemonkey', function () {
    expectFirstPageTextToContain('../pdfs/tracemonkey.pdf', [
      'no concrete type information is available',
      'difficult to com-pile than statically ',
      'this work for personal or classroom use is'
    ]);
  });
});

test/unit/unit_test.html

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
<script src="stream_spec.js"></script>
5252
<script src="parser_spec.js"></script>
5353
<script src="api_spec.js"></script>
54+
<script src="text_extract.js"></script>
5455
<script src="metadata_spec.js"></script>
5556
<script src="util_spec.js"></script>
5657
<script src="cmap_spec.js"></script>

0 commit comments

Comments
 (0)