Skip to content

Commit b2237d1

Browse files
feat: add extractTextItems for structured text extraction with positional data
1 parent 4573b27 commit b2237d1

3 files changed

Lines changed: 90 additions & 2 deletions

File tree

src/index.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import { extractImages as _extractImages, renderPageAsImage as _renderPageAsImage } from './image'
22
import { extractLinks as _extractLinks } from './links'
33
import { getMeta as _getMeta } from './meta'
4-
import { extractText as _extractText } from './text'
4+
import { extractText as _extractText, extractTextItems as _extractTextItems } from './text'
55
import { resolvePDFJSImport } from './utils'
66

77
export { configureUnPDF, definePDFJSModule } from './config'
88
export { createIsomorphicCanvasFactory } from './image'
9+
export type { StructuredTextItem } from './text'
910

1011
export {
1112
getDocumentProxy,
@@ -23,6 +24,11 @@ export const extractText: typeof _extractText = async (...args) => {
2324
return await (_extractText as any)(...args)
2425
}
2526

27+
/**
 * Public wrapper around the `extractTextItems` implementation from `./text`.
 *
 * Awaits `resolvePDFJSImport()` first (presumably so a PDF.js build is
 * available before any document is opened — confirm in `./utils`), then
 * forwards all arguments unchanged and returns the implementation's result.
 */
export const extractTextItems: typeof _extractTextItems = async (...params) => {
  await resolvePDFJSImport()
  const result = await _extractTextItems(...params)
  return result
}
31+
2632
export const extractImages: typeof _extractImages = async (...args) => {
2733
await resolvePDFJSImport()
2834
return await _extractImages(...args)

src/text.ts

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,64 @@
1-
import type { DocumentInitParameters, PDFDocumentProxy, TextItem } from 'pdfjs-dist/types/src/display/api'
1+
import type { DocumentInitParameters, PDFDocumentProxy, TextItem, TextStyle } from 'pdfjs-dist/types/src/display/api'
22
import { getDocumentProxy, isPDFDocumentProxy } from './utils'
33

4+
/**
 * A single run of text from a PDF page, together with the positional and
 * font information needed to reconstruct layout.
 */
export interface StructuredTextItem {
  /** The text of this item. */
  str: string
  /** Horizontal position in PDF coordinate space (origin: bottom-left). */
  x: number
  /** Vertical position in PDF coordinate space (origin: bottom-left). */
  y: number
  /** Item width in device space. */
  width: number
  /** Item height in device space. */
  height: number
  /** Font size, computed from the item's transformation matrix. */
  fontSize: number
  /** Name of the font family. */
  fontFamily: string
  /** Direction of the text: `"ltr"`, `"rtl"`, or `"ttb"`. */
  dir: string
  /** `true` when a line break follows this item. */
  hasEOL: boolean
}
24+
25+
/**
 * Extracts the structured text items of every page in a PDF.
 *
 * @param data Either PDF data to open via `getDocumentProxy`, or a document
 * proxy that is already open (used as-is).
 * @returns The document's page count and one array of items per page, in
 * page order (index 0 holds page 1).
 */
export async function extractTextItems(
  data: DocumentInitParameters['data'] | PDFDocumentProxy,
): Promise<{ totalPages: number, items: StructuredTextItem[][] }> {
  let pdf: PDFDocumentProxy
  if (isPDFDocumentProxy(data))
    pdf = data
  else
    pdf = await getDocumentProxy(data)

  // Kick off every page before awaiting so pages are processed concurrently.
  const pending: Promise<StructuredTextItem[]>[] = []
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++)
    pending.push(getPageTextItems(pdf, pageNumber))

  const items = await Promise.all(pending)
  return { totalPages: pdf.numPages, items }
}
35+
36+
/**
 * Collects the structured text items of a single page.
 *
 * @param document Open PDF document proxy to read from.
 * @param pageNumber Page number passed to `document.getPage` (the caller in
 * this file starts counting at 1).
 * @returns One `StructuredTextItem` per text-content entry that carries a
 * non-null `str`; entries without it are skipped.
 */
async function getPageTextItems(
  document: PDFDocumentProxy,
  pageNumber: number,
): Promise<StructuredTextItem[]> {
  const page = await document.getPage(pageNumber)
  const content = await page.getTextContent()
  const styles = content.styles as Record<string, TextStyle>

  return content.items
    // Narrow with a real type guard instead of asserting `TextItem[]` up
    // front: the list may contain entries that have no `str` at all.
    .filter((item): item is TextItem => 'str' in item && item.str != null)
    .map((item) => {
      // transform is [a, b, c, d, e, f]; only c/d (scale) and e/f (offset) are used.
      const [, , c, d, e, f] = item.transform
      return {
        str: item.str,
        x: e,
        y: f,
        width: item.width,
        height: item.height,
        // Length of the (c, d) vector of the transform.
        fontSize: Math.hypot(c, d),
        // Fall back to an empty string when no style is registered for the font.
        fontFamily: styles[item.fontName]?.fontFamily ?? '',
        dir: item.dir,
        hasEOL: item.hasEOL,
      }
    })
}
61+
462
export function extractText(
563
data: DocumentInitParameters['data'] | PDFDocumentProxy,
664
options?: { mergePages?: false },

test/index.test.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import {
88
extractImages,
99
extractLinks,
1010
extractText,
11+
extractTextItems,
1112
getDocumentProxy,
1213
getMeta,
1314
getResolvedPDFJS,
@@ -61,6 +62,29 @@ describe('unpdf', () => {
6162
expect(totalPages).toMatchInlineSnapshot('1')
6263
})
6364

65+
// Checks the page count, the per-page grouping, and the exact shape of the
// first item on the first page of the default fixture PDF.
it('extracts structured text items from a PDF', async () => {
  const pdf = await getPDF()
  const result = await extractTextItems(pdf)

  expect(result.totalPages).toBe(1)
  expect(result.items).toHaveLength(1)

  const firstPage = result.items[0]!
  expect(firstPage.length).toBeGreaterThan(0)

  const leadingItem = firstPage[0]!
  expect(leadingItem).toMatchInlineSnapshot(`
    {
      "dir": "ltr",
      "fontFamily": "sans-serif",
      "fontSize": 16.1,
      "hasEOL": false,
      "height": 16.1,
      "str": "Dummy PDF file",
      "width": 123.41130000000003,
      "x": 56.8,
      "y": 758.1,
    }
  `)
})
87+
6488
it('extracts links from a PDF', async () => {
6589
const { links, totalPages } = await extractLinks(await getPDF('links.pdf'))
6690
expect(links.length).toMatchInlineSnapshot('4')

0 commit comments

Comments
 (0)