Skip to content

Commit c6da1ee

Browse files
author
speedplane
committed
Improve copy/paste by inserting spaces into textChunks if we deem it appropriate.
Add test re same. PR mozilla#5783.
1 parent fa0f09b commit c6da1ee

File tree

5 files changed

+112
-0
lines changed

5 files changed

+112
-0
lines changed

src/core/evaluator.js

+58
Original file line numberDiff line numberDiff line change
@@ -1037,8 +1037,66 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
10371037
} else {
10381038
textChunk.height += Math.abs(height * scaleCtmX * scaleLineX);
10391039
}
1040+
addSpaceIfNecessary(textChunk, font);
10401041
return textChunk;
10411042
}
1043+
/**
 * Derives the reference position of a text chunk from its transform.
 *
 * @param {Object} chunk - Text chunk; only `chunk.transform` is read, and
 *   only its indices 0 through 5.
 * @param {Object} font - Font description; `vertical`, `ascent` and
 *   `descent` are read.
 * @returns {{x: number, y: number}} `x` is `transform[4]`, shifted by the
 *   ascent projected through the rotation angle when that angle is non-zero;
 *   `y` is `transform[5]` unchanged.
 */
function getChunkPos(chunk, font) {
  var tx = chunk.transform;
  // Rotation implied by the first two matrix entries; vertical fonts are
  // treated as rotated a further quarter turn.
  var angle = Math.atan2(tx[1], tx[0]);
  if (font.vertical) {
    angle += Math.PI / 2;
  }
  // Magnitude of (tx[2], tx[3]) serves as the rendered font height.
  var fontHeight = Math.sqrt((tx[2] * tx[2]) + (tx[3] * tx[3]));
  // Prefer the font's ascent; otherwise derive one from the descent; if
  // neither is set (or both are 0) fall back to the full font height.
  var fontAscent = fontHeight;
  if (font.ascent) {
    fontAscent = font.ascent * fontAscent;
  } else if (font.descent) {
    fontAscent = (1 + font.descent) * fontAscent;
  }
  return {
    // Only x is corrected for rotation; y is the raw translation component.
    x: (angle === 0) ? tx[4] : (tx[4] + (fontAscent * Math.sin(angle))),
    y: tx[5]
  };
}
1061+
/**
 * Appends a single space to the most recent entry of `bidiTexts` when the
 * gap between that entry and `newChunk` looks like a word or line break.
 * The previous chunk's `str` is mutated in place; nothing is returned.
 *
 * `bidiTexts`, `textState` and `getChunkPos` are taken from the enclosing
 * scope.
 *
 * @param {Object} newChunk - Chunk about to follow the last one; `str`,
 *   `width`, `height` and (through `getChunkPos`) `transform` are read.
 * @param {Object} font - Font passed to `getChunkPos` for both chunks.
 */
function addSpaceIfNecessary(newChunk, font) {
  // A chunk that already starts with a space or a hyphen needs no separator.
  if (newChunk.str[0] === ' ' || newChunk.str[0] === '-') {
    return;
  }
  // Nothing precedes the new chunk.
  if (bidiTexts.length === 0) {
    return;
  }
  var lastChunk = bidiTexts[bidiTexts.length - 1];
  if (lastChunk.str.length === 0) {
    return;
  }
  // Likewise when the previous chunk already ends with a space or a hyphen.
  var lastChar = lastChunk.str[lastChunk.str.length - 1];
  if (lastChar === ' ' || lastChar === '-') {
    return;
  }

  // NOTE(review): both positions are computed with the new chunk's font;
  // confirm that is acceptable when the previous chunk used another font.
  var lastPos = getChunkPos(lastChunk, font);
  var newPos = getChunkPos(newChunk, font);

  var yDiff = Math.abs(lastPos.y - newPos.y);
  if (yDiff >= lastChunk.height || yDiff >= newChunk.height) {
    // Vertical offset of at least one chunk height: treat the chunks as
    // being on different lines and separate them.
    lastChunk.str += ' ';
    return;
  }

  var wordSpacing;
  if (textState.wordSpacing > 0) {
    // Explicit word spacing from the text state.
    wordSpacing = textState.wordSpacing;
  } else {
    // Heuristic: 60% of the new chunk's average character width.
    wordSpacing = newChunk.width / newChunk.str.length * 0.6;
  }

  var addSpace;
  if (newPos.x >= lastPos.x) {
    // Left to right: the new chunk starts past the end of the last one.
    addSpace = newPos.x >= lastPos.x + lastChunk.width + wordSpacing;
  } else {
    // Right to left: the last chunk starts past the end of the new one.
    addSpace = lastPos.x >= newPos.x + newChunk.width + wordSpacing;
  }
  if (addSpace) {
    lastChunk.str += ' ';
  }
}
10421100

10431101
var timeSlotManager = new TimeSlotManager();
10441102

test/pdfs/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,4 @@
118118
!issue5481.pdf
119119
!issue5567.pdf
120120
!issue5701.pdf
121+
!US6205527_page1.pdf

test/pdfs/US6205527_page1.pdf

521 KB
Binary file not shown.

test/unit/text_extract.js

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2+
/* vim: set shiftwidth=2 tabstop=2 autoindent cindent expandtab: */
3+
/* globals PDFJS, expect, it, describe, Promise, combineUrl, waitsFor,
4+
isArray, MissingPDFException */
5+
6+
'use strict';
7+
8+
/**
 * Blocks the current spec (via `waitsFor`) until `promise` settles.
 *
 * On fulfilment `successCallback` receives the value. On rejection a failing
 * expectation is recorded. In both cases the wait is released right away;
 * completion is tracked with a dedicated flag so that a promise fulfilled
 * with `undefined` is still recognised as finished.
 *
 * @param {Promise} promise - Promise to wait for.
 * @param {function(*)} successCallback - Called with the fulfilment value.
 */
function waitsForPromiseResolved(promise, successCallback) {
  var settled = false;
  promise.then(function(val) {
    // Mark completion first so the wait ends even if the callback throws.
    settled = true;
    successCallback(val);
  },
  function(error) {
    // Shouldn't get here.
    settled = true;
    expect(false).toEqual(true);
  });
  waitsFor(function() {
    return settled;
  }, 20000);
}
22+
23+
// Checks that text extracted from the first page of the fixture PDF comes
// out in reading order, with spaces inserted between adjacent chunks.
describe('text-extract', function() {
  var pdfURL = combineUrl(window.location.href, '../pdfs/US6205527_page1.pdf');
  // Start loading while the suite is declared, but only wait on the result
  // from inside the spec. Chaining the promises (rather than resolving a
  // hand-built one) lets a failed load reject pagePromise instead of leaving
  // it pending.
  var pagePromise = PDFJS.getDocument(pdfURL).then(function(doc) {
    return doc.getPage(1);
  });

  it('gets text content', function () {
    waitsForPromiseResolved(pagePromise, function (page) {
      waitsForPromiseResolved(page.getTextContent(), function (textContent) {
        expect(!!textContent.items).toEqual(true);
        var text = textContent.items.map(function (item) {
          return item.str;
        }).join('');
        // Make sure the text is ordered properly. Compare against -1 so a
        // match at offset 0 still counts as found.
        expect(text.indexOf('Disclosed is an apparatus, a system, a'))
          .not.toEqual(-1);
        expect(text.indexOf('device to the computer system; (b) preparing ' +
                            'a storage. media of the peripheral storage'))
          .not.toEqual(-1);
      });
    });
  });
});

test/unit/unit_test.html

+1
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
<script src="stream_spec.js"></script>
5252
<script src="parser_spec.js"></script>
5353
<script src="api_spec.js"></script>
54+
<script src="text_extract.js"></script>
5455
<script src="metadata_spec.js"></script>
5556
<script src="util_spec.js"></script>
5657
<script src="cmap_spec.js"></script>

0 commit comments

Comments
 (0)