feat: add extractTextItems for structured text extraction with positional data

johannschopplich · johannschopplich · commit b2237d143083 · 2026-04-13T14:10:53.000+02:00
diff --git a/src/index.ts b/src/index.ts
@@ -1,11 +1,12 @@
 import { extractImages as _extractImages, renderPageAsImage as _renderPageAsImage } from './image'
 import { extractLinks as _extractLinks } from './links'
 import { getMeta as _getMeta } from './meta'
-import { extractText as _extractText } from './text'
+import { extractText as _extractText, extractTextItems as _extractTextItems } from './text'
 import { resolvePDFJSImport } from './utils'
 
 export { configureUnPDF, definePDFJSModule } from './config'
 export { createIsomorphicCanvasFactory } from './image'
+export type { StructuredTextItem } from './text'
 
 export {
   getDocumentProxy,
@@ -23,6 +24,11 @@ export const extractText: typeof _extractText = async (...args) => {
   return await (_extractText as any)(...args)
 }
 
+export const extractTextItems: typeof _extractTextItems = async (...args) => {
+  await resolvePDFJSImport() // NOTE(review): presumably makes sure the PDF.js module is loaded first — confirm in ./utils
+  return await _extractTextItems(...args) // forward all arguments to the implementation imported from ./text
+}
+
 export const extractImages: typeof _extractImages = async (...args) => {
   await resolvePDFJSImport()
   return await _extractImages(...args)
diff --git a/src/text.ts b/src/text.ts
@@ -1,6 +1,64 @@
-import type { DocumentInitParameters, PDFDocumentProxy, TextItem } from 'pdfjs-dist/types/src/display/api'
+import type { DocumentInitParameters, PDFDocumentProxy, TextItem, TextStyle } from 'pdfjs-dist/types/src/display/api'
 import { getDocumentProxy, isPDFDocumentProxy } from './utils'
 
+export interface StructuredTextItem {
+  /** Text content of the item. */
+  str: string
+  /** X position in PDF coordinate space (origin: bottom-left); read from the `e` entry of the item's transform. */
+  x: number
+  /** Y position in PDF coordinate space (origin: bottom-left); read from the `f` entry of the item's transform. */
+  y: number
+  /** Width in device space, copied unchanged from the source text item. */
+  width: number
+  /** Height in device space, copied unchanged from the source text item. */
+  height: number
+  /** Font size, computed as `Math.hypot(c, d)` from the item's transformation matrix. */
+  fontSize: number
+  /** Font family name from the page's text styles; empty string when no family is available for the item's font. */
+  fontFamily: string
+  /** Text direction: `"ltr"`, `"rtl"`, or `"ttb"`. */
+  dir: string
+  /** Whether the text item is followed by a line break. */
+  hasEOL: boolean
+}
+
+export async function extractTextItems(
+  data: DocumentInitParameters['data'] | PDFDocumentProxy,
+): Promise<{ totalPages: number, items: StructuredTextItem[][] }> {
+  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data) // reuse a passed-in proxy, otherwise open one from the raw data
+  const items = await Promise.all( // every page is requested up front; results keep page order
+    Array.from({ length: pdf.numPages }, (_, i) => getPageTextItems(pdf, i + 1)), // page numbers run 1..numPages
+  )
+
+  return { totalPages: pdf.numPages, items } // items[p] holds the text items of page p + 1
+}
+
+async function getPageTextItems(
+  document: PDFDocumentProxy,
+  pageNumber: number,
+): Promise<StructuredTextItem[]> {
+  const page = await document.getPage(pageNumber)
+  const content = await page.getTextContent()
+  const styles = content.styles as Record<string, TextStyle> // style entries are looked up by item.fontName below
+
+  return (content.items as TextItem[])
+    .filter(item => item.str != null) // NOTE(review): presumably drops marked-content entries, which carry no `str` — confirm
+    .map((item) => {
+      const [_a, _b, c, d, e, f] = item.transform // six-entry matrix; only c/d (font size) and e/f (position) are used
+      return {
+        str: item.str,
+        x: e,
+        y: f,
+        width: item.width,
+        height: item.height,
+        fontSize: Math.hypot(c, d), // length of the (c, d) vector of the matrix
+        fontFamily: styles[item.fontName]?.fontFamily ?? '', // fall back to '' when the style entry or its family is missing
+        dir: item.dir,
+        hasEOL: item.hasEOL,
+      }
+    })
+}
+
 export function extractText(
   data: DocumentInitParameters['data'] | PDFDocumentProxy,
   options?: { mergePages?: false },
diff --git a/test/index.test.ts b/test/index.test.ts
@@ -8,6 +8,7 @@ import {
   extractImages,
   extractLinks,
   extractText,
+  extractTextItems,
   getDocumentProxy,
   getMeta,
   getResolvedPDFJS,
@@ -61,6 +62,29 @@ describe('unpdf', () => {
     expect(totalPages).toMatchInlineSnapshot('1')
   })
 
+  it('extracts structured text items from a PDF', async () => {
+    const { items, totalPages } = await extractTextItems(await getPDF()) // NOTE(review): getPDF() with no argument presumably loads the default fixture — confirm
+
+    expect(totalPages).toBe(1)
+    expect(items).toHaveLength(1) // one inner array per page
+    expect(items[0]!.length).toBeGreaterThan(0) // the page must yield at least one item before the first is inspected
+
+    const firstItem = items[0]![0]!
+    expect(firstItem).toMatchInlineSnapshot(`
+      {
+        "dir": "ltr",
+        "fontFamily": "sans-serif",
+        "fontSize": 16.1,
+        "hasEOL": false,
+        "height": 16.1,
+        "str": "Dummy PDF file",
+        "width": 123.41130000000003,
+        "x": 56.8,
+        "y": 758.1,
+      }
+    `)
+  })
+
   it('extracts links from a PDF', async () => {
     const { links, totalPages } = await extractLinks(await getPDF('links.pdf'))
     expect(links.length).toMatchInlineSnapshot('4')