Skip to content

Extract text from annotated shapes and use it in thumbnail alt text #7077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 43 additions & 8 deletions src/annotator/anchoring/pdf.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import type {
import { translateOffsets } from '../util/normalize';
import { matchQuote } from './match-quote';
import { createPlaceholder } from './placeholder';
import { textInDOMRect } from './text-in-rect';
import { TextPosition, TextRange } from './text-range';
import { TextQuoteAnchor } from './types';

Expand Down Expand Up @@ -128,6 +129,12 @@ async function getPageView(pageIndex: number): Promise<PDFPageView> {
return pageView!;
}

/**
 * Find the text layer element at a given point.
 *
 * The elements reported by `document.elementsFromPoint` are scanned in the
 * order they are returned, and the first one carrying the `textLayer` class
 * is used.
 *
 * @param x - Horizontal coordinate passed to `document.elementsFromPoint`
 * @param y - Vertical coordinate passed to `document.elementsFromPoint`
 * @returns The matching element, or `undefined` if none of the elements at
 *   the point has the `textLayer` class
 */
function getTextLayerFromPoint(x: number, y: number): HTMLElement | undefined {
  for (const candidate of document.elementsFromPoint(x, y)) {
    if (candidate.classList.contains('textLayer')) {
      return candidate as HTMLElement;
    }
  }
  return undefined;
}

/**
* Return true if the document has selectable text.
*/
Expand Down Expand Up @@ -848,6 +855,13 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
};
};

/**
 * Extract the text of `textLayer` that intersects `rect`, truncated to a
 * fixed maximum length.
 *
 * @param textLayer - Element whose text is searched
 * @param rect - Region to extract text from, passed through to `textInDOMRect`
 * @returns At most `maxTextLen` UTF-16 code units of the extracted text
 */
const textFromRect = (textLayer: HTMLElement, rect: DOMRect): string => {
  // Set a limit on how much text is included in thumbnails, to avoid shape
  // selector objects becoming too large.
  const maxTextLen = 256;

  const text = textInDOMRect(textLayer, rect);
  if (text.length <= maxTextLen) {
    return text;
  }

  // String offsets count UTF-16 code units, so cutting at a fixed offset can
  // split a surrogate pair and leave a lone high surrogate at the end of the
  // result. Drop that dangling half so the truncated text stays well-formed.
  const lastUnit = text.charCodeAt(maxTextLen - 1);
  const endsWithHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return text.slice(0, endsWithHighSurrogate ? maxTextLen - 1 : maxTextLen);
};

switch (shape.type) {
case 'rect': {
const [topLeft, bottomRight] = await Promise.all([
Expand All @@ -866,20 +880,34 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
}

const pageView = await getPageView(topLeft.pageIndex);
const pdfRect = {
type: 'rect',
left: topLeft.x,
top: topLeft.y,
right: bottomRight.x,
bottom: bottomRight.y,
} as const;

const textLayer = getTextLayerFromPoint(shape.left, shape.top);
let text;
if (textLayer) {
const rect = new DOMRect(
shape.left,
shape.top,
shape.right - shape.left,
shape.bottom - shape.top,
);
text = textFromRect(textLayer, rect);
}

return [
createPageSelector(pageView, topLeft.pageIndex),
{
type: 'ShapeSelector',
anchor: 'page',
shape: {
type: 'rect',
left: topLeft.x,
top: topLeft.y,
right: bottomRight.x,
bottom: bottomRight.y,
},
shape: pdfRect,
view: pageBoundingBox(pageView.pdfPage),
text,
},
];
}
Expand All @@ -889,8 +917,14 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
throw new Error('Point is not in a page');
}

const pageView = await getPageView(point.pageIndex);
const textLayer = getTextLayerFromPoint(shape.x, shape.y);
let text;
if (textLayer) {
const rect = new DOMRect(shape.x, shape.y, 1, 1);
text = textFromRect(textLayer, rect);
}

const pageView = await getPageView(point.pageIndex);
return [
createPageSelector(pageView, point.pageIndex),
{
Expand All @@ -901,6 +935,7 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
x: point.x,
y: point.y,
},
text,
view: pageBoundingBox(pageView.pdfPage),
},
];
Expand Down
90 changes: 87 additions & 3 deletions src/annotator/anchoring/test/pdf-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -964,10 +964,26 @@ describe('annotator/anchoring/pdf', () => {

describe('describeShape', () => {
let elementsFromPoint;
let textLayer;
let fakeTextInDOMRect;

const borderLeft = 5;
const borderTop = 8;

// Build a sinon matcher which compares the geometry of a `DOMRect`.
//
// A custom matcher is used because passing a `DOMRect` directly to eg.
// `assert.calledWith` always matches, whether the values are equal or not.
const matchRect = expected => {
  const hasSameGeometry = rect =>
    ['x', 'y', 'width', 'height'].every(key => rect[key] === expected[key]);

  return sinon.match(
    hasSameGeometry,
    `DOMRect { x=${expected.x}, y=${expected.y} width=${expected.width} height=${expected.height} }`,
  );
};

beforeEach(() => {
for (let i = 0; i < viewer.pdfViewer.pagesCount; i++) {
const pageDiv = viewer.pdfViewer.getPageView(i).div;
Expand All @@ -979,6 +995,9 @@ describe('annotator/anchoring/pdf', () => {
// which are not a PDF page container, are ignored.
const dummy = document.createElement('div');

textLayer = document.createElement('div');
textLayer.className = 'textLayer';

// Override `elementsFromPoint` to control how viewport coordinates are
// mapped to pages.
elementsFromPoint = sinon.stub(document, 'elementsFromPoint');
Expand All @@ -994,7 +1013,13 @@ describe('annotator/anchoring/pdf', () => {
}

const pageDiv = viewer.pdfViewer.getPageView(pageIndex).div;
return [dummy, pageDiv];
return [dummy, textLayer, pageDiv];
});

fakeTextInDOMRect = sinon.stub().returns('text-in-shape');

pdfAnchoring.$imports.$mock({
'./text-in-rect': { textInDOMRect: fakeTextInDOMRect },
});
});

Expand Down Expand Up @@ -1024,6 +1049,11 @@ describe('annotator/anchoring/pdf', () => {
y: 10 + borderTop,
});

assert.calledWith(
fakeTextInDOMRect,
textLayer,
matchRect(new DOMRect(10 + borderLeft, 10 + borderTop, 1, 1)),
);
assert.deepEqual(selectors, [
{
type: 'PageSelector',
Expand All @@ -1044,9 +1074,21 @@ describe('annotator/anchoring/pdf', () => {
right: 100,
top: 200,
},
text: 'text-in-shape',
},
]);
});

it('does not extract text if there is no text layer', async () => {
textLayer.className = 'notTheTextLayer';
const selectors = await describeShape({
type: 'point',
x: 10 + borderLeft,
y: 10 + borderTop,
});
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
assert.isUndefined(shapeSelector.text);
});
});

context('when shape is a rect', () => {
Expand Down Expand Up @@ -1090,14 +1132,29 @@ describe('annotator/anchoring/pdf', () => {
const [expectedLeft, expectedTop] = pageView.getPagePoint(10, 10);
const [expectedRight, expectedBottom] = pageView.getPagePoint(30, 50);

const selectors = await describeShape({
type: 'rect',
const rect = {
left: 10 + borderLeft,
top: 10 + borderTop,
right: 30 + borderLeft,
bottom: 50 + borderTop,
};
const selectors = await describeShape({
type: 'rect',
...rect,
});

assert.calledWith(
fakeTextInDOMRect,
textLayer,
matchRect(
new DOMRect(
rect.left,
rect.top,
rect.right - rect.left,
rect.bottom - rect.top,
),
),
);
assert.deepEqual(selectors, [
{
type: 'PageSelector',
Expand All @@ -1120,11 +1177,38 @@ describe('annotator/anchoring/pdf', () => {
right: 100,
top: 200,
},
text: 'text-in-shape',
},
]);
});
});

it('does not extract text if there is no text layer', async () => {
textLayer.className = 'notTheTextLayer';
const selectors = await describeShape({
type: 'rect',
left: 10 + borderLeft,
top: 10 + borderTop,
right: 30 + borderLeft,
bottom: 50 + borderTop,
});
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
assert.isUndefined(shapeSelector.text);
});

it('truncates extracted text', async () => {
fakeTextInDOMRect.returns('a'.repeat(300));
const selectors = await describeShape({
type: 'rect',
left: 10 + borderLeft,
top: 10 + borderTop,
right: 100,
bottom: 100,
});
const shapeSelector = selectors.find(s => s.type === 'ShapeSelector');
assert.equal(shapeSelector.text, 'a'.repeat(256));
});

it('throws if shape is unsupported', async () => {
let err;
try {
Expand Down
69 changes: 69 additions & 0 deletions src/annotator/anchoring/test/text-in-rect-test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import { textInDOMRect } from '../text-in-rect';

describe('textInDOMRect', () => {
  let container;

  beforeEach(() => {
    // Fixture: two absolutely positioned, 200px wide columns placed side by
    // side, each containing two lines of text separated by a `<br>`.
    container = document.createElement('div');
    container.style.position = 'fixed';

    const leftColumn = document.createElement('p');
    leftColumn.className = 'left-column';
    Object.assign(leftColumn.style, {
      position: 'absolute',
      width: '200px',
    });
    leftColumn.append('Line one', document.createElement('br'), 'Line two');

    const rightColumn = document.createElement('p');
    rightColumn.className = 'right-column';
    Object.assign(rightColumn.style, {
      position: 'absolute',
      width: '200px',
      left: '200px',
    });
    rightColumn.append('Line three', document.createElement('br'), 'Line four');

    container.append(leftColumn, rightColumn);

    document.body.append(container);
  });

  afterEach(() => {
    container.remove();
  });

  // Each case carries its own description so that every generated test has a
  // distinct title and a failure can be traced back to the case that caused it.
  [
    {
      description: 'rect covering whole left column',
      rect: new DOMRect(0, 0, 200, 200),
      expected: 'Line one Line two',
    },
    {
      description: 'rect covering whole right column',
      rect: new DOMRect(200, 0, 200, 200),
      expected: 'Line three Line four',
    },
    {
      description: 'tiny rect touching first word in left column',
      rect: new DOMRect(10, 10, 1, 1),
      expected: 'Line',
    },
    {
      description: 'zero-sized rect touching first word in left column',
      rect: new DOMRect(10, 10, 0, 0),
      expected: '',
    },
  ].forEach(({ description, rect, expected }) => {
    it(`returns text in rect (${description})`, () => {
      const text = textInDOMRect(container, rect);
      assert.equal(text, expected);
    });
  });

  it('only returns text from root container', () => {
    // The rect spans both columns, but only the left column is searched.
    const leftColumn = container.querySelector('.left-column');
    const text = textInDOMRect(leftColumn, new DOMRect(0, 0, 500, 500));
    assert.equal(text, 'Line one Line two');
  });
});
59 changes: 59 additions & 0 deletions src/annotator/anchoring/text-in-rect.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import { rectIntersects, rectsOverlapVertically } from '../util/geometry';

/**
 * Collect the text under `root` that falls within `rect`.
 *
 * Every text node below {@link root} is broken up at word boundaries. Each
 * piece is measured with a `Range`, and the pieces whose bounding rectangle
 * intersects {@link rect} are concatenated in the order they are visited.
 *
 * @param root - Element whose descendant text nodes are searched
 * @param rect - Region of interest, in client coordinates
 * @returns The concatenated matching text, or an empty string if no piece
 *   intersects the region
 */
export function textInDOMRect(root: Element, rect: DOMRect): string {
  const textNodes = root.ownerDocument!.createNodeIterator(
    root,
    NodeFilter.SHOW_TEXT,
  );

  // Matching pieces of text, in the order they were encountered.
  const pieces: string[] = [];

  // Bounding rect of the piece most recently added to `pieces`.
  let lastRect: DOMRect | undefined;

  for (let node = textNodes.nextNode(); node; node = textNodes.nextNode()) {
    const textNode = node as Text;
    let start = 0;

    // Splitting on `\b` rather than on spaces means the whitespace between
    // words is kept, as pieces of its own.
    for (const piece of textNode.data.split(/\b/)) {
      const end = start + piece.length;

      const range = new Range();
      range.setStart(textNode, start);
      range.setEnd(textNode, end);
      const pieceRect = range.getBoundingClientRect();

      start = end;

      if (!rectIntersects(pieceRect, rect)) {
        continue;
      }

      // Spaces between words on the same line are assumed to already be part
      // of the text. Between lines they are assumed to be absent, so add one
      // when this piece does not vertically overlap the previous one.
      if (lastRect && !rectsOverlapVertically(lastRect, pieceRect)) {
        pieces.push(' ');
      }

      pieces.push(piece);
      lastRect = pieceRect;
    }
  }

  return pieces.join('');
}
Loading