Skip to content

Commit 2038260

Browse files
author
Parita Pooj
authored
Merge pull request parita#6 from sks147/master
Added new random subroutine which can extract text from ms-word file
2 parents d917c5b + be69557 commit 2038260

1 file changed

Lines changed: 20 additions & 0 deletions

File tree

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#You can extract the text content from each Microsoft Word document in a directory
2+
#tree on Windows into a corresponding text file.
3+
#With the PyWin32 extension, we can access Word itself, through COM, to perform
4+
#the conversion:
5+
import fnmatch, os, sys, win32com.client
6+
# Convert every '*.doc' file found under the directory given as the first
# command-line argument into a plain-text file saved next to the original
# (same base name, '.txt' extension). The conversion is performed by Word
# itself, driven through COM.
#
# EnsureDispatch returns an early-bound Word.Application object; the
# win32com.client.constants lookup used below depends on that early binding.
wordapp = win32com.client.gencache.EnsureDispatch("Word.Application")
try:
    for path, dirs, files in os.walk(sys.argv[1]):
        for filename in files:
            if not fnmatch.fnmatch(filename, '*.doc'):
                continue
            doc = os.path.abspath(os.path.join(path, filename))
            # Single parenthesized argument: prints the same text whether
            # this runs as a Python 2 print statement or a Python 3 call.
            print("processing %s" % doc)
            # Keep the Document returned by Open instead of going through
            # ActiveDocument, so we always act on the file we just opened.
            document = wordapp.Documents.Open(doc)
            try:
                # Swap the extension for '.txt' (equivalent to dropping the
                # trailing 'doc', since only '*.doc' names reach this point).
                docastxt = os.path.splitext(doc)[0] + '.txt'
                document.SaveAs(
                    docastxt,
                    FileFormat=win32com.client.constants.wdFormatText)
            finally:
                # close this document even if the save failed, so it does
                # not stay open while the remaining files are processed
                document.Close()
finally:
    # ensure Word is properly shut down even if we get an exception
    wordapp.Quit()

0 commit comments

Comments
 (0)