-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathBookShelfScrape.py
More file actions
79 lines (71 loc) · 3.31 KB
/
BookShelfScrape.py
File metadata and controls
79 lines (71 loc) · 3.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#! Python
# Goes to bookshelf and downloads blue box questions from the CFA textbooks
import requests, json, re, os
from bs4 import BeautifulSoup
from docx import Document
# Prompt for the VitalSource account credentials used by the session below.
username = input('Please type in your E-mail: ')
password = input('Please type in your password: ')
# Form fields expected by the jigsaw.vitalsource.com login endpoint.
payload = {
    'user[email]': username,
    'user[password]': password
}
# Year and CFA level select which books in the library get downloaded.
Year = input('What year of books do you want?: ')
Level = input('What Level? (I,II,III): ')
# Create the output folder for the downloaded documents.
# os.path.join is used instead of the original "\CFABlueBoxes" string:
# "\C" is an invalid escape sequence (SyntaxWarning on modern Python)
# and a hard-coded backslash only works on Windows.
startingLocation = os.path.abspath(os.curdir)
location = os.path.join(startingLocation, 'CFABlueBoxes')
if not os.path.exists(location):
    os.makedirs(location)
os.chdir(location)
# Logging in and keeping a session open
# Log in once and keep an authenticated session for all following requests.
with requests.Session() as s:
    # Credentials belong in the POST body (data=), not the query string
    # (params=): Rails-style "user[email]" form logins read the request
    # body, and query strings leak passwords into server/proxy logs.
    p = s.post('https://jigsaw.vitalsource.com/login', data=payload)
    parsed_Library = json.loads(p.text)['books']
    # Matches titles like "2019 CFA Level II ..." for the requested year/level.
    my_regex = re.compile(str(Year) + r'\sCFA\sLevel\s' + Level + r'\s')
    isbnList = []
    # Collect every book in the library matching the year/level filter.
    for book in parsed_Library:
        title = book['title']
        isbn = book['isbn']
        if my_regex.search(title):
            isbnList.append({'title': title, 'isbn': int(isbn)})
    # Compiled once here: the pattern is invariant across all loop iterations.
    readingRegex = re.compile(r'Reading\s{1,10}\d{1,3}')
    # Visit each matching book's table of contents.
    for book in isbnList:
        bookUrl = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/cfi/6/8!'
        tableOfContents = s.get(bookUrl).text
        soup = BeautifulSoup(tableOfContents, "html.parser")
        tags = soup.findAll('li')
        # os.path.join replaces the original '\CFABlueBoxes' + r'\ ' + title
        # concatenation, which used an invalid '\C' escape, baked a stray
        # leading space into every folder name, and was Windows-only.
        newLocation = os.path.join(startingLocation, 'CFABlueBoxes', book['title'])
        if not os.path.exists(newLocation):
            os.makedirs(newLocation)
        os.chdir(newLocation)
        # Walk every link in the TOC; only "Reading N" links are chapters.
        for tag in tags:
            for link in tag.findAll('a'):
                if not readingRegex.search(link.text):
                    continue
                newURL = ('https://jigsaw.vitalsource.com/books/' + str(book['isbn'])
                          + '/epub/OEBPS/' + str(link['href']))
                reading = s.get(newURL).text
                soupReading = BeautifulSoup(reading, "html.parser")
                # Drop all <span> content (footnote markers etc.) before
                # extracting the figures.
                for span_tag in soupReading.findAll('span'):
                    span_tag.replace_with('')
                # Reduce the page title to letters only so it is safe to use
                # as a file name.
                title = soupReading.find('title').text
                title = " ".join(re.findall("[a-zA-Z]+", title))
                # Blue-box examples are marked up as <figure class="example">.
                figures = soupReading.findAll("figure", {"class": "example"})
                if not figures:
                    continue
                # Context manager guarantees the file is closed even if a
                # write raises mid-loop (original left it open on error).
                with open('%s.html' % title, 'wb') as readingFile:
                    for figure in figures:
                        # Rewrite relative image srcs to absolute URLs so the
                        # saved HTML still renders its images offline.
                        for image_tag in figure.findAll('img'):
                            image_tag['src'] = ('https://jigsaw.vitalsource.com/books/'
                                                + str(book['isbn']) + '/epub/OEBPS/'
                                                + image_tag['src'])
                        readingFile.write(figure.encode('UTF-8'))