hypothesis · robertknight · May 14, 2025 · May 14, 2025
diff --git a/src/annotator/anchoring/pdf.ts b/src/annotator/anchoring/pdf.ts
@@ -17,6 +17,7 @@ import type {
 import { translateOffsets } from '../util/normalize';
 import { matchQuote } from './match-quote';
 import { createPlaceholder } from './placeholder';
+import { textInDOMRect } from './text-in-rect';
 import { TextPosition, TextRange } from './text-range';
 import { TextQuoteAnchor } from './types';
 
@@ -128,6 +129,12 @@ async function getPageView(pageIndex: number): Promise<PDFPageView> {
   return pageView!;
 }
 
+function getTextLayerFromPoint(x: number, y: number): HTMLElement | undefined {
+  const isTextLayer = (el: Element) => el.classList.contains('textLayer');
+  const stacked = document.elementsFromPoint(x, y);
+  return stacked.find(isTextLayer) as HTMLElement | undefined;
+}
+
 /**
  * Return true if the document has selectable text.
  */
@@ -848,6 +855,13 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
     };
   };
 
+  const textFromRect = (textLayer: HTMLElement, rect: DOMRect) => {
+    // Cap selector text size; strip a high surrogate orphaned by truncation.
+    const maxTextLen = 256;
+    const text = textInDOMRect(textLayer, rect).slice(0, maxTextLen);
+    return text.replace(/[\uD800-\uDBFF]$/, '');
+  };
+
   switch (shape.type) {
     case 'rect': {
       const [topLeft, bottomRight] = await Promise.all([
@@ -866,20 +880,34 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
       }
 
       const pageView = await getPageView(topLeft.pageIndex);
+      const pdfRect = {
+        type: 'rect',
+        left: topLeft.x,
+        top: topLeft.y,
+        right: bottomRight.x,
+        bottom: bottomRight.y,
+      } as const;
+
+      const textLayer = getTextLayerFromPoint(shape.left, shape.top);
+      let text;
+      if (textLayer) {
+        const rect = new DOMRect(
+          shape.left,
+          shape.top,
+          shape.right - shape.left,
+          shape.bottom - shape.top,
+        );
+        text = textFromRect(textLayer, rect);
+      }
 
       return [
         createPageSelector(pageView, topLeft.pageIndex),
         {
           type: 'ShapeSelector',
           anchor: 'page',
-          shape: {
-            type: 'rect',
-            left: topLeft.x,
-            top: topLeft.y,
-            right: bottomRight.x,
-            bottom: bottomRight.y,
-          },
+          shape: pdfRect,
           view: pageBoundingBox(pageView.pdfPage),
+          text,
         },
       ];
     }
@@ -889,8 +917,14 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
         throw new Error('Point is not in a page');
       }
 
-      const pageView = await getPageView(point.pageIndex);
+      const textLayer = getTextLayerFromPoint(shape.x, shape.y);
+      let text;
+      if (textLayer) {
+        const rect = new DOMRect(shape.x, shape.y, 1, 1);
+        text = textFromRect(textLayer, rect);
+      }
 
+      const pageView = await getPageView(point.pageIndex);
       return [
         createPageSelector(pageView, point.pageIndex),
         {
@@ -901,6 +935,7 @@ export async function describeShape(shape: Shape): Promise<Selector[]> {
             x: point.x,
             y: point.y,
           },
+          text,
           view: pageBoundingBox(pageView.pdfPage),
         },
       ];

diff --git a/src/annotator/anchoring/test/pdf-test.js b/src/annotator/anchoring/test/pdf-test.js
@@ -964,10 +964,26 @@ describe('annotator/anchoring/pdf', () => {
 
   describe('describeShape', () => {
     let elementsFromPoint;
+    let textLayer;
+    let fakeTextInDOMRect;
 
     const borderLeft = 5;
     const borderTop = 8;
 
+    // Build a sinon matcher that accepts a `DOMRect` whose position and size
+    // equal those of `expected`.
+    //
+    // Passing a `DOMRect` directly to eg. `assert.calledWith` does not work,
+    // as that match always succeeds whether the values are equal or not.
+    const matchRect = expected => {
+      const fields = ['x', 'y', 'width', 'height'];
+
+      return sinon.match(
+        actual => fields.every(field => actual[field] === expected[field]),
+        `DOMRect { x=${expected.x}, y=${expected.y} width=${expected.width} height=${expected.height} }`,
+      );
+    };
+
     beforeEach(() => {
       for (let i = 0; i < viewer.pdfViewer.pagesCount; i++) {
         const pageDiv = viewer.pdfViewer.getPageView(i).div;
@@ -979,6 +995,9 @@ describe('annotator/anchoring/pdf', () => {
       // which are not a PDF page container, are ignored.
       const dummy = document.createElement('div');
 
+      textLayer = document.createElement('div');
+      textLayer.className = 'textLayer';
+
       // Override `elementsFromPoint` to control how viewport coordinates are
       // mapped to pages.
       elementsFromPoint = sinon.stub(document, 'elementsFromPoint');
@@ -994,7 +1013,13 @@ describe('annotator/anchoring/pdf', () => {
         }
 
         const pageDiv = viewer.pdfViewer.getPageView(pageIndex).div;
-        return [dummy, pageDiv];
+        return [dummy, textLayer, pageDiv];
+      });
+
+      fakeTextInDOMRect = sinon.stub().returns('text-in-shape');
+
+      pdfAnchoring.$imports.$mock({
+        './text-in-rect': { textInDOMRect: fakeTextInDOMRect },
       });
     });
 
@@ -1024,6 +1049,11 @@ describe('annotator/anchoring/pdf', () => {
           y: 10 + borderTop,
         });
 
+        assert.calledWith(
+          fakeTextInDOMRect,
+          textLayer,
+          matchRect(new DOMRect(10 + borderLeft, 10 + borderTop, 1, 1)),
+        );
         assert.deepEqual(selectors, [
           {
             type: 'PageSelector',
@@ -1044,9 +1074,21 @@ describe('annotator/anchoring/pdf', () => {
               right: 100,
               top: 200,
             },
+            text: 'text-in-shape',
           },
         ]);
       });
+
+      it('does not extract text if there is no text layer', async () => {
+        // Rename the class so this element no longer matches `textLayer`.
+        textLayer.className = 'notTheTextLayer';
+        const point = { type: 'point', x: 10 + borderLeft, y: 10 + borderTop };
+
+        const selectors = await describeShape(point);
+        const isShape = selector => selector.type === 'ShapeSelector';
+        const shapeSelector = selectors.find(isShape);
+        assert.isUndefined(shapeSelector.text);
+      });
     });
 
     context('when shape is a rect', () => {
@@ -1090,14 +1132,29 @@ describe('annotator/anchoring/pdf', () => {
         const [expectedLeft, expectedTop] = pageView.getPagePoint(10, 10);
         const [expectedRight, expectedBottom] = pageView.getPagePoint(30, 50);
 
-        const selectors = await describeShape({
-          type: 'rect',
+        const rect = {
           left: 10 + borderLeft,
           top: 10 + borderTop,
           right: 30 + borderLeft,
           bottom: 50 + borderTop,
+        };
+        const selectors = await describeShape({
+          type: 'rect',
+          ...rect,
         });
 
+        assert.calledWith(
+          fakeTextInDOMRect,
+          textLayer,
+          matchRect(
+            new DOMRect(
+              rect.left,
+              rect.top,
+              rect.right - rect.left,
+              rect.bottom - rect.top,
+            ),
+          ),
+        );
         assert.deepEqual(selectors, [
           {
             type: 'PageSelector',
@@ -1120,11 +1177,38 @@ describe('annotator/anchoring/pdf', () => {
               right: 100,
               top: 200,
             },
+            text: 'text-in-shape',
           },
         ]);
       });
     });
 
+    it('does not extract text if there is no text layer', async () => {
+      // Rename the class so this element no longer matches `textLayer`.
+      textLayer.className = 'notTheTextLayer';
+      const [left, top] = [10 + borderLeft, 10 + borderTop];
+      const [right, bottom] = [30 + borderLeft, 50 + borderTop];
+      const rect = { type: 'rect', left, top, right, bottom };
+
+      const selectors = await describeShape(rect);
+      const isShape = selector => selector.type === 'ShapeSelector';
+      const shapeSelector = selectors.find(isShape);
+      assert.isUndefined(shapeSelector.text);
+    });
+
+    it('truncates extracted text', async () => {
+      // Make the stub return more text than the 256 characters asserted below.
+      const longText = 'a'.repeat(300);
+      fakeTextInDOMRect.returns(longText);
+      const [left, top] = [10 + borderLeft, 10 + borderTop];
+      const rect = { type: 'rect', left, top, right: 100, bottom: 100 };
+
+      const selectors = await describeShape(rect);
+      const isShape = selector => selector.type === 'ShapeSelector';
+      const shapeSelector = selectors.find(isShape);
+      assert.equal(shapeSelector.text, longText.slice(0, 256));
+    });
+
     it('throws if shape is unsupported', async () => {
       let err;
       try {

diff --git a/src/annotator/anchoring/test/text-in-rect-test.js b/src/annotator/anchoring/test/text-in-rect-test.js
@@ -0,0 +1,69 @@
+import { textInDOMRect } from '../text-in-rect';
+
+describe('textInDOMRect', () => {
+  let container;
+
+  beforeEach(() => {
+    container = document.createElement('div');
+    container.style.position = 'fixed';
+
+    const leftColumn = document.createElement('p');
+    leftColumn.className = 'left-column';
+    Object.assign(leftColumn.style, {
+      position: 'absolute',
+      width: '200px',
+    });
+    leftColumn.append('Line one', document.createElement('br'), 'Line two');
+
+    const rightColumn = document.createElement('p');
+    rightColumn.className = 'right-column';
+    Object.assign(rightColumn.style, {
+      position: 'absolute',
+      width: '200px',
+      left: '200px',
+    });
+    rightColumn.append('Line three', document.createElement('br'), 'Line four');
+
+    container.append(leftColumn, rightColumn);
+
+    document.body.append(container);
+  });
+
+  afterEach(() => {
+    container.remove();
+  });
+
+  [
+    {
+      description: 'rect covering whole left column',
+      rect: new DOMRect(0, 0, 200, 200),
+      expected: 'Line one Line two',
+    },
+    {
+      description: 'rect covering whole right column',
+      rect: new DOMRect(200, 0, 200, 200),
+      expected: 'Line three Line four',
+    },
+    {
+      description: 'tiny rect touching first word in left column',
+      rect: new DOMRect(10, 10, 1, 1),
+      expected: 'Line',
+    },
+    {
+      description: 'zero-sized rect touching first word in left column',
+      rect: new DOMRect(10, 10, 0, 0),
+      expected: '',
+    },
+  ].forEach(({ description, rect, expected }) => {
+    it(`returns text in rect (${description})`, () => {
+      const text = textInDOMRect(container, rect);
+      assert.equal(text, expected);
+    });
+  });
+
+  it('only returns text from root container', () => {
+    const leftColumn = container.querySelector('.left-column');
+    const text = textInDOMRect(leftColumn, new DOMRect(0, 0, 500, 500));
+    assert.equal(text, 'Line one Line two');
+  });
+});
diff --git a/src/annotator/anchoring/text-in-rect.ts b/src/annotator/anchoring/text-in-rect.ts
@@ -0,0 +1,59 @@
+import { rectIntersects, rectsOverlapVertically } from '../util/geometry';
+
+/**
+ * Return the DOM text that intersects a given rect.
+ *
+ * The text nodes under {@link root} are split into runs of word characters
+ * (Unicode letters, marks, digits and `_`) and runs of everything else. Each
+ * run whose bounding rectangle intersects {@link rect} is added to the output.
+ *
+ * @param root - Root element of the DOM tree to search
+ * @param rect - Client coordinates of the region
+ */
+export function textInDOMRect(root: Element, rect: DOMRect): string {
+  const iter = root.ownerDocument!.createNodeIterator(
+    root,
+    NodeFilter.SHOW_TEXT,
+  );
+
+  // Matches a run of word characters or a run of other characters. `\b` is not
+  // used as it only treats ASCII letters, digits and `_` as word characters.
+  const wordOrGap = /[\p{L}\p{M}\p{N}_]+|[^\p{L}\p{M}\p{N}_]+/gu;
+
+  // Pieces of text that intersect the rect.
+  const textChunks = [];
+
+  // Rect for previous text chunk which was included in the output.
+  let prevChunkRect;
+
+  let currentNode;
+  while ((currentNode = iter.nextNode())) {
+    const textNode = currentNode as Text;
+
+    // Inter-word spaces and punctuation form their own "words", so the word
+    // lengths add up to the offsets used for the ranges below.
+    const words = textNode.data.match(wordOrGap) ?? [];
+    let offset = 0;
+
+    for (const word of words) {
+      const range = new Range();
+      range.setStart(textNode, offset);
+      offset += word.length;
+      range.setEnd(textNode, offset);
+      const wordRect = range.getBoundingClientRect();
+
+      if (rectIntersects(wordRect, rect)) {
+        // We assume that spaces are included in the text between words on a
+        // line, but not between lines.
+        if (prevChunkRect && !rectsOverlapVertically(prevChunkRect, wordRect)) {
+          textChunks.push(' ');
+        }
+
+        textChunks.push(word);
+        prevChunkRect = wordRect;
+      }
+    }
+  }
+
+  return textChunks.join('');
+}