|
1 | | -import type { DocumentInitParameters, PDFDocumentProxy, TextItem } from 'pdfjs-dist/types/src/display/api' |
| 1 | +import type { DocumentInitParameters, PDFDocumentProxy, TextItem, TextStyle } from 'pdfjs-dist/types/src/display/api' |
2 | 2 | import { getDocumentProxy, isPDFDocumentProxy } from './utils' |
3 | 3 |
|
/**
 * One run of text taken from a PDF page, together with the layout and font
 * details needed to position it.
 */
export interface StructuredTextItem {
  /** The text of this run. */
  str: string
  /** Horizontal position in PDF coordinate space (origin: bottom-left). */
  x: number
  /** Vertical position in PDF coordinate space (origin: bottom-left). */
  y: number
  /** Width of the run, in device space. */
  width: number
  /** Height of the run, in device space. */
  height: number
  /** Font size, derived from the item's transformation matrix. */
  fontSize: number
  /** Name of the font family; empty when no style entry exists for the font. */
  fontFamily: string
  /** Direction of the text: `"ltr"`, `"rtl"`, or `"ttb"`. */
  dir: string
  /** `true` when a line break follows this text item. */
  hasEOL: boolean
}
| 24 | + |
/**
 * Collects the structured text items of every page in a PDF.
 *
 * @param data Raw document data, or an already opened `PDFDocumentProxy`.
 * @returns The page count and, for each page in order, that page's text items.
 */
export async function extractTextItems(
  data: DocumentInitParameters['data'] | PDFDocumentProxy,
): Promise<{ totalPages: number, items: StructuredTextItem[][] }> {
  // Reuse a proxy handed in by the caller; otherwise open the document first.
  const pdf = isPDFDocumentProxy(data) ? data : await getDocumentProxy(data)
  const totalPages = pdf.numPages

  // Start extraction for all pages up front (page numbers are 1-based) so they
  // are processed concurrently, then wait for the whole batch.
  const pending: Promise<StructuredTextItem[]>[] = []
  for (let pageNumber = 1; pageNumber <= totalPages; pageNumber++)
    pending.push(getPageTextItems(pdf, pageNumber))

  const items = await Promise.all(pending)
  return { totalPages, items }
}
| 35 | + |
/**
 * Reads the text content of a single page and converts each text item into a
 * `StructuredTextItem`.
 *
 * Entries without a `str` field are skipped (pdf.js may interleave
 * marked-content markers with the actual text items).
 *
 * @param document Open PDF document to read from.
 * @param pageNumber 1-based number of the page to extract.
 * @returns The page's text items, in the order pdf.js reports them.
 */
async function getPageTextItems(
  document: PDFDocumentProxy,
  pageNumber: number,
): Promise<StructuredTextItem[]> {
  const page = await document.getPage(pageNumber)
  const content = await page.getTextContent()
  // Annotated declaration instead of an `as` cast so the assignment is checked.
  const styles: Record<string, TextStyle> = content.styles

  return content.items
    // Narrow via a type predicate rather than asserting the whole array.
    .filter((item): item is TextItem => 'str' in item)
    .map((item): StructuredTextItem => {
      // transform is [a, b, c, d, e, f]: (e, f) is the translation, and the
      // length of the (c, d) column is used as the font size.
      const [, , c, d, e, f] = item.transform
      return {
        str: item.str,
        x: e,
        y: f,
        width: item.width,
        height: item.height,
        fontSize: Math.hypot(c, d),
        // Fall back to an empty name when the font has no style entry.
        fontFamily: styles[item.fontName]?.fontFamily ?? '',
        dir: item.dir,
        hasEOL: item.hasEOL,
      }
    })
}
| 61 | + |
4 | 62 | export function extractText( |
5 | 63 | data: DocumentInitParameters['data'] | PDFDocumentProxy, |
6 | 64 | options?: { mergePages?: false }, |
|
0 commit comments