Skip to content

Commit 2e46544

Browse files
committed
Add text field to shape selectors
Add a field to shape selectors containing the text that intersects a shape, using word-level granularity.
1 parent bbed920 commit 2e46544

File tree

5 files changed

+259
-11
lines changed

5 files changed

+259
-11
lines changed

src/annotator/anchoring/pdf.ts

+43-8
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import type {
1717
import { translateOffsets } from '../util/normalize';
1818
import { matchQuote } from './match-quote';
1919
import { createPlaceholder } from './placeholder';
20+
import { textInDOMRect } from './text-in-rect';
2021
import { TextPosition, TextRange } from './text-range';
2122
import { TextQuoteAnchor } from './types';
2223

@@ -128,6 +129,12 @@ async function getPageView(pageIndex: number): Promise<PDFPageView> {
128129
return pageView!;
129130
}
130131

132+
/**
 * Return the topmost element at a viewport point that has the `textLayer`
 * CSS class.
 *
 * @param x - Horizontal viewport coordinate passed to `elementsFromPoint`
 * @param y - Vertical viewport coordinate passed to `elementsFromPoint`
 * @returns The first matching element in `elementsFromPoint` order, or
 *   `undefined` if no element at that point has the `textLayer` class
 */
function getTextLayerFromPoint(x: number, y: number): HTMLElement | undefined {
  for (const candidate of document.elementsFromPoint(x, y)) {
    if (candidate.classList.contains('textLayer')) {
      return candidate as HTMLElement;
    }
  }
  return undefined;
}
137+
131138
/**
132139
* Return true if the document has selectable text.
133140
*/
@@ -848,6 +855,13 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
848855
};
849856
};
850857

858+
const textFromRect = (textLayer: HTMLElement, rect: DOMRect) => {
859+
// Set a limit on how much extracted text is included, to avoid shape
860+
// selector objects becoming too large.
861+
const maxTextLen = 256;
862+
return textInDOMRect(textLayer, rect).slice(0, maxTextLen);
863+
};
864+
851865
switch (shape.type) {
852866
case 'rect': {
853867
const [topLeft, bottomRight] = await Promise.all([
@@ -866,20 +880,34 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
866880
}
867881

868882
const pageView = await getPageView(topLeft.pageIndex);
883+
const pdfRect = {
884+
type: 'rect',
885+
left: topLeft.x,
886+
top: topLeft.y,
887+
right: bottomRight.x,
888+
bottom: bottomRight.y,
889+
} as const;
890+
891+
const textLayer = getTextLayerFromPoint(shape.left, shape.top);
892+
let text;
893+
if (textLayer) {
894+
const rect = new DOMRect(
895+
shape.left,
896+
shape.top,
897+
shape.right - shape.left,
898+
shape.bottom - shape.top,
899+
);
900+
text = textFromRect(textLayer, rect);
901+
}
869902

870903
return [
871904
createPageSelector(pageView, topLeft.pageIndex),
872905
{
873906
type: 'ShapeSelector',
874907
anchor: 'page',
875-
shape: {
876-
type: 'rect',
877-
left: topLeft.x,
878-
top: topLeft.y,
879-
right: bottomRight.x,
880-
bottom: bottomRight.y,
881-
},
908+
shape: pdfRect,
882909
view: pageBoundingBox(pageView.pdfPage),
910+
text,
883911
},
884912
];
885913
}
@@ -889,8 +917,14 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
889917
throw new Error('Point is not in a page');
890918
}
891919

892-
const pageView = await getPageView(point.pageIndex);
920+
const textLayer = getTextLayerFromPoint(shape.x, shape.y);
921+
let text;
922+
if (textLayer) {
923+
const rect = new DOMRect(shape.x, shape.y, 1, 1);
924+
text = textFromRect(textLayer, rect);
925+
}
893926

927+
const pageView = await getPageView(point.pageIndex);
894928
return [
895929
createPageSelector(pageView, point.pageIndex),
896930
{
@@ -901,6 +935,7 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
901935
x: point.x,
902936
y: point.y,
903937
},
938+
text,
904939
view: pageBoundingBox(pageView.pdfPage),
905940
},
906941
];

src/annotator/anchoring/test/pdf-test.js

+87-3
Original file line numberDiff line numberDiff line change
@@ -964,10 +964,26 @@ describe('annotator/anchoring/pdf', () => {
964964

965965
describe('describeShape', () => {
966966
let elementsFromPoint;
967+
let textLayer;
968+
let fakeTextInDOMRect;
967969

968970
const borderLeft = 5;
969971
const borderTop = 8;
970972

973+
// Create a matcher for a `DOMRect`.
974+
//
975+
// Note that if you pass a `DOMRect` directly to eg. `assert.calledWith`,
976+
// the match will always succeed, whether the values are equal or not.
977+
const matchRect = expected =>
978+
sinon.match(
979+
rect =>
980+
rect.x === expected.x &&
981+
rect.y === expected.y &&
982+
rect.width === expected.width &&
983+
rect.height === expected.height,
984+
`DOMRect { x=${expected.x}, y=${expected.y} width=${expected.width} height=${expected.height} }`,
985+
);
986+
971987
beforeEach(() => {
972988
for (let i = 0; i < viewer.pdfViewer.pagesCount; i++) {
973989
const pageDiv = viewer.pdfViewer.getPageView(i).div;
@@ -979,6 +995,9 @@ describe('annotator/anchoring/pdf', () => {
979995
// which are not a PDF page container, are ignored.
980996
const dummy = document.createElement('div');
981997

998+
textLayer = document.createElement('div');
999+
textLayer.className = 'textLayer';
1000+
9821001
// Override `elementsFromPoint` to control how viewport coordinates are
9831002
// mapped to pages.
9841003
elementsFromPoint = sinon.stub(document, 'elementsFromPoint');
@@ -994,7 +1013,13 @@ describe('annotator/anchoring/pdf', () => {
9941013
}
9951014

9961015
const pageDiv = viewer.pdfViewer.getPageView(pageIndex).div;
997-
return [dummy, pageDiv];
1016+
return [dummy, textLayer, pageDiv];
1017+
});
1018+
1019+
fakeTextInDOMRect = sinon.stub().returns('text-in-shape');
1020+
1021+
pdfAnchoring.$imports.$mock({
1022+
'./text-in-rect': { textInDOMRect: fakeTextInDOMRect },
9981023
});
9991024
});
10001025

@@ -1024,6 +1049,11 @@ describe('annotator/anchoring/pdf', () => {
10241049
y: 10 + borderTop,
10251050
});
10261051

1052+
assert.calledWith(
1053+
fakeTextInDOMRect,
1054+
textLayer,
1055+
matchRect(new DOMRect(10 + borderLeft, 10 + borderTop, 1, 1)),
1056+
);
10271057
assert.deepEqual(selectors, [
10281058
{
10291059
type: 'PageSelector',
@@ -1044,9 +1074,21 @@ describe('annotator/anchoring/pdf', () => {
10441074
right: 100,
10451075
top: 200,
10461076
},
1077+
text: 'text-in-shape',
10471078
},
10481079
]);
10491080
});
1081+
1082+
it('does not extract text if there is no text layer', async () => {
1083+
textLayer.className = 'notTheTextLayer';
1084+
const selectors = await describeShape({
1085+
type: 'point',
1086+
x: 10 + borderLeft,
1087+
y: 10 + borderTop,
1088+
});
1089+
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
1090+
assert.isUndefined(shapeSelector.text);
1091+
});
10501092
});
10511093

10521094
context('when shape is a rect', () => {
@@ -1090,14 +1132,29 @@ describe('annotator/anchoring/pdf', () => {
10901132
const [expectedLeft, expectedTop] = pageView.getPagePoint(10, 10);
10911133
const [expectedRight, expectedBottom] = pageView.getPagePoint(30, 50);
10921134

1093-
const selectors = await describeShape({
1094-
type: 'rect',
1135+
const rect = {
10951136
left: 10 + borderLeft,
10961137
top: 10 + borderTop,
10971138
right: 30 + borderLeft,
10981139
bottom: 50 + borderTop,
1140+
};
1141+
const selectors = await describeShape({
1142+
type: 'rect',
1143+
...rect,
10991144
});
11001145

1146+
assert.calledWith(
1147+
fakeTextInDOMRect,
1148+
textLayer,
1149+
matchRect(
1150+
new DOMRect(
1151+
rect.left,
1152+
rect.top,
1153+
rect.right - rect.left,
1154+
rect.bottom - rect.top,
1155+
),
1156+
),
1157+
);
11011158
assert.deepEqual(selectors, [
11021159
{
11031160
type: 'PageSelector',
@@ -1120,11 +1177,38 @@ describe('annotator/anchoring/pdf', () => {
11201177
right: 100,
11211178
top: 200,
11221179
},
1180+
text: 'text-in-shape',
11231181
},
11241182
]);
11251183
});
11261184
});
11271185

1186+
it('does not extract text if there is no text layer', async () => {
1187+
textLayer.className = 'notTheTextLayer';
1188+
const selectors = await describeShape({
1189+
type: 'rect',
1190+
left: 10 + borderLeft,
1191+
top: 10 + borderTop,
1192+
right: 30 + borderLeft,
1193+
bottom: 50 + borderTop,
1194+
});
1195+
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
1196+
assert.isUndefined(shapeSelector.text);
1197+
});
1198+
1199+
it('truncates extracted text', async () => {
1200+
fakeTextInDOMRect.returns('a'.repeat(300));
1201+
const selectors = await describeShape({
1202+
type: 'rect',
1203+
left: 10 + borderLeft,
1204+
top: 10 + borderTop,
1205+
right: 100,
1206+
bottom: 100,
1207+
});
1208+
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
1209+
assert.equal(shapeSelector.text, 'a'.repeat(256));
1210+
});
1211+
11281212
it('throws if shape is unsupported', async () => {
11291213
let err;
11301214
try {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import { textInDOMRect } from '../text-in-rect';
2+
3+
describe('textInDOMRect', () => {
4+
let container;
5+
6+
beforeEach(() => {
7+
container = document.createElement('div');
8+
container.style.position = 'fixed';
9+
10+
const leftColumn = document.createElement('p');
11+
leftColumn.className = 'left-column';
12+
Object.assign(leftColumn.style, {
13+
position: 'absolute',
14+
width: '200px',
15+
});
16+
leftColumn.append('Line one', document.createElement('br'), 'Line two');
17+
18+
const rightColumn = document.createElement('p');
19+
rightColumn.className = 'right-column';
20+
Object.assign(rightColumn.style, {
21+
position: 'absolute',
22+
width: '200px',
23+
left: '200px',
24+
});
25+
rightColumn.append('Line three', document.createElement('br'), 'Line four');
26+
27+
container.append(leftColumn, rightColumn);
28+
29+
document.body.append(container);
30+
});
31+
32+
afterEach(() => {
33+
container.remove();
34+
});
35+
36+
[
37+
// Rect covering whole left column
38+
{
39+
rect: new DOMRect(0, 0, 200, 200),
40+
expected: 'Line one Line two',
41+
},
42+
// Rect covering whole right column
43+
{
44+
rect: new DOMRect(200, 0, 200, 200),
45+
expected: 'Line three Line four',
46+
},
47+
// Tiny rect touching first word in left column
48+
{
49+
rect: new DOMRect(10, 10, 1, 1),
50+
expected: 'Line',
51+
},
52+
// Zero-sized rect touching first word in left column
53+
{
54+
rect: new DOMRect(10, 10, 0, 0),
55+
expected: '',
56+
},
57+
].forEach(({ rect, expected }) => {
58+
it('returns text in rect', () => {
59+
const text = textInDOMRect(container, rect);
60+
assert.equal(text, expected);
61+
});
62+
});
63+
64+
it('only returns text from root container', () => {
65+
const leftColumn = container.querySelector('.left-column');
66+
const text = textInDOMRect(leftColumn, new DOMRect(0, 0, 500, 500));
67+
assert.equal(text, 'Line one Line two');
68+
});
69+
});
+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import { rectIntersects, rectsOverlapVertically } from '../util/geometry';
2+
3+
/**
4+
* Return the DOM text that intersects a given rect.
5+
*
6+
* The text nodes under {@link root} are split into chunks and the bounding
7+
* rectangle of the chunk is intersected with {@link rect}. If the intersection
8+
* is non-empty, the text of that chunk is added to the output string.
9+
*
10+
* @param root - Root element of the DOM tree to search
11+
* @param rect - Client coordinates of the region
12+
*/
13+
export function textInDOMRect(root: Element, rect: DOMRect): string {
14+
const iter = root.ownerDocument!.createNodeIterator(
15+
root,
16+
NodeFilter.SHOW_TEXT,
17+
);
18+
const textChunks = [];
19+
20+
// Rect for previous text chunk which was included in the output.
21+
let prevChunkRect;
22+
23+
let currentNode;
24+
while ((currentNode = iter.nextNode())) {
25+
const textNode = currentNode as Text;
26+
27+
// We split on word boundaries here rather than spaces, so inter-word spaces
28+
// are included in the "words".
29+
const words = textNode.data.split(/\b/);
30+
let offset = 0;
31+
32+
for (const word of words) {
33+
const range = new Range();
34+
range.setStart(textNode, offset);
35+
const endOffset = offset + word.length;
36+
range.setEnd(textNode, endOffset);
37+
const wordRect = range.getBoundingClientRect();
38+
39+
if (rectIntersects(wordRect, rect)) {
40+
// We assume that spaces are included in the text between words on a
41+
// line, but not between lines.
42+
const newLine =
43+
prevChunkRect && !rectsOverlapVertically(prevChunkRect, wordRect);
44+
if (newLine) {
45+
textChunks.push(' ');
46+
}
47+
48+
textChunks.push(word);
49+
prevChunkRect = wordRect;
50+
}
51+
52+
offset = endOffset;
53+
}
54+
}
55+
56+
return textChunks.join('');
57+
}

src/types/api.ts

+3
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,9 @@ export type ShapeSelector = {
212212
right: number;
213213
bottom: number;
214214
};
215+
216+
/** The text contained inside this shape. */
217+
text?: string;
215218
};
216219

217220
/**

0 commit comments

Comments
 (0)