-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdfbox
More file actions
132 lines (117 loc) · 4.54 KB
/
pdfbox
File metadata and controls
132 lines (117 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
# coding: utf-8
# @Time : 2018/1/23 3:43 PM
# @Author : Derek
# @File : pdfbox.py
import hashlib
import os
import requests
import appdirs
import sarge
# PDFBox release that is downloaded and cached when no local jar is configured.
pdfbox_version = '2.0.8'

# Location of the standalone "pdfbox-app" jar for that release.
pdfbox_url = 'https://www.apache.org/dist/pdfbox/{version}/pdfbox-app-{version}.jar'.format(
    version=pdfbox_version)

# The MD5 checksum file sits next to the jar, under the same name plus ".md5".
md5_url = pdfbox_url + '.md5'
class PDFBox(object):
    """
    Python interface to Apache PDFBox.

    On construction the PDFBox jar is located (or downloaded and cached) and
    the `java` executable is looked up on PATH.

    Methods
    -------
    extract_text(input_path, output_path='',
                 password=None, encoding=None, html=False, sort=False,
                 ignore_beads=False, force=False, start_page=1, end_page=None)
        Extract all text from PDF file.
    """

    def which(self, name):
        """
        Locate an executable on PATH.

        Parameters
        ----------
        name : str
            File name of the executable to look for.

        Returns
        -------
        full_path : str or None
            Path of the first executable regular file called `name` found in
            the PATH directories, or None if there is none (including when
            PATH is unset).
        """
        for directory in os.environ.get('PATH', '').split(os.pathsep):
            if not directory:
                continue
            full_path = os.path.join(directory, name)
            # A directory or a non-executable file with the right name is not
            # something that can be launched, so it does not count as a hit.
            if os.path.isfile(full_path) and os.access(full_path, os.X_OK):
                return full_path
        return None

    def _verify_md5(self, data, digest):
        """
        Verify MD5 checksum.

        Parameters
        ----------
        data : bytes
            Content to hash.
        digest : str
            Expected hexadecimal MD5 digest.

        Returns
        -------
        ok : bool
            True if the MD5 hex digest of `data` equals `digest`.
        """
        return hashlib.md5(data).hexdigest() == digest

    def _get_pdfbox_path(self):
        """
        Return path to local copy of PDFBox jar file.

        The PDFBOX environment variable takes precedence. Otherwise the jar
        is looked up in the per-user cache directory and, if absent, is
        downloaded from `pdfbox_url`, checked against the checksum published
        at `md5_url`, and stored there.

        Returns
        -------
        pdfbox_path : str
            Path to the PDFBox jar file.

        Raises
        ------
        RuntimeError
            If PDFBOX points to a missing file, or the jar or its checksum
            cannot be retrieved, or the checksum does not match.
        """
        # Use PDFBOX environmental variable if it exists:
        if 'PDFBOX' in os.environ:
            pdfbox_path = os.environ['PDFBOX']
            if not os.path.exists(pdfbox_path):
                raise RuntimeError('pdfbox not found')
            return pdfbox_path

        # Use platform-specific cache directory:
        cache_dir = appdirs.AppDirs('python-pdfbox').user_cache_dir
        jar_name = os.path.basename(pdfbox_url)
        pdfbox_path = os.path.join(cache_dir, jar_name)

        # Retrieve, verify, and cache PDFBox jar file:
        if not os.path.exists(pdfbox_path):
            if not os.path.isdir(cache_dir):
                # makedirs: the parent of the cache directory may not exist.
                os.makedirs(cache_dir)

            # The request itself is what can fail, so it belongs inside the
            # try; an HTTP error status must not be cached as if it were a jar.
            try:
                response = requests.get(pdfbox_url)
                response.raise_for_status()
                data = response.content
            except requests.RequestException:
                raise RuntimeError('error retrieving %s' % jar_name)

            try:
                response = requests.get(md5_url)
                response.raise_for_status()
                # NOTE(review): assumes the checksum file starts with the hex
                # digest (optionally followed by a file name) - confirm.
                digest = response.text.split()[0].lower()
            except (requests.RequestException, IndexError):
                raise RuntimeError('error retrieving md5')

            # Verify before writing so a corrupt download is never cached.
            if not self._verify_md5(data, digest):
                raise RuntimeError('failed to verify md5')

            with open(pdfbox_path, 'wb') as f:
                f.write(data)
            # Kept from the original implementation: report where the jar was
            # cached. Written so it is valid under both Python 2 and 3.
            print(pdfbox_path)
        return pdfbox_path

    def __init__(self):
        """
        Locate the PDFBox jar and the Java executable.

        Raises
        ------
        RuntimeError
            If the PDFBox jar cannot be obtained or `java` is not on PATH.
        """
        self.pdfbox_path = self._get_pdfbox_path()
        self.java_path = self.which('java')
        if not self.java_path:
            raise RuntimeError('java not found')

    def _quote_arg(self, value):
        """
        Return `value` as text that is safe to embed in the command line.
        """
        text = '{0}'.format(value)
        if os.name != 'posix':
            # Quoting below follows POSIX rules; elsewhere the argument is
            # passed through unchanged, as the original code did.
            return text
        try:
            from shlex import quote
        except ImportError:  # Python 2
            from pipes import quote
        return quote(text)

    def _build_command(self, input_path, output_path, password, encoding,
                       html, sort, ignore_beads, force, start_page, end_page):
        """
        Assemble the ExtractText command line for the given settings.
        """
        quote = self._quote_arg
        parts = [quote(self.java_path), '-jar', quote(self.pdfbox_path),
                 'ExtractText']
        if password:
            parts += ['-password', quote(password)]
        if encoding:
            parts += ['-encoding', quote(encoding)]
        if html:
            parts.append('-html')
        if sort:
            parts.append('-sort')
        if ignore_beads:
            parts.append('-ignoreBeads')
        if force:
            parts.append('-force')
        if start_page:
            parts += ['-startPage', quote(start_page)]
        if end_page:
            parts += ['-endPage', quote(end_page)]
        if not output_path:
            # Without an output file the text is sent to standard output.
            parts.append('-console')
        parts.append(quote(input_path))
        if output_path:
            parts.append(quote(output_path))
        return ' '.join(parts)

    def extract_text(self, input_path, output_path='',
                     password=None, encoding=None, html=False, sort=False,
                     ignore_beads=False, force=False, start_page=1, end_page=None):
        """
        Extract all text from PDF file.

        Parameters
        ----------
        input_path : str
            Input PDF file.
        output_path : str
            Output text file. If not specified, the extracted text is returned.
        password : str
            PDF password.
        encoding : str
            Text file encoding.
        html : bool
            If True, extract as HTML.
        sort : bool
            If True, sort text before returning it.
        ignore_beads : bool
            If True, ignore separation by beads.
        force : bool
            If True, ignore corrupt objects.
        start_page : int
            First page to extract (starting with 1).
        end_page : int
            Last page to extract (starting with 1).

        Returns
        -------
        text : str or None
            Extracted text. If `output_path` is specified, nothing is
            returned.
        """
        cmd = self._build_command(input_path, output_path, password, encoding,
                                  html, sort, ignore_beads, force,
                                  start_page, end_page)
        p = sarge.capture_stdout(cmd)
        if not output_path:
            return p.stdout.text
        return None