Skip to content

Commit 292c642

Browse files
BookShelfScrape v1.1
1 parent 954f0f2 commit 292c642

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

BookShelfScrape v1.1

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
#! Python
# Goes to Bookshelf (VitalSource) and downloads the "blue box" example
# questions from the CFA textbooks, saving each reading's examples as an
# HTML file under a per-book folder.

import requests, json, re, os
from bs4 import BeautifulSoup

username = input('Please type in your E-mail: ')
password = input('Please type in your password: ')

# Credentials for the VitalSource login form.
payload = {
    'user[email]': username,
    'user[password]': password
}

# Which curriculum year and CFA level of books to download.
year = input('What year of books do you want?: ')
level = input('What Level? (I,II,III): ')

# Create the output folder (portable: os.path.join instead of hand-built
# backslash paths, which also triggered invalid-escape warnings).
starting_location = os.path.abspath(os.curdir)
location = os.path.join(starting_location, 'CFABlueBoxes')
if not os.path.exists(location):
    os.makedirs(location)
os.chdir(location)
print('ALL FILES WILL BE SAVED IN: %s' % location)

# Log in and keep one session (cookies) open for all scraping requests.
with requests.Session() as s:
    # NOTE(review): credentials are sent as URL query parameters
    # (params=); the endpoint appears to accept this, but form data
    # (data=payload) is the conventional choice -- confirm before changing.
    p = s.post('https://jigsaw.vitalsource.com/login', params=payload)
    parsed_library = json.loads(p.text)['books']
    # re.escape keeps stray regex metacharacters in the user's input
    # from breaking (or widening) the title match.
    title_regex = re.compile(
        re.escape(str(year)) + r'\sCFA\sLevel\s' + re.escape(level) + r'\s')
    isbn_list = []
    # Collect the books whose titles match the requested year and level.
    for book in parsed_library:
        title = book['title']
        isbn = book['isbn']
        if title_regex.search(title):
            # Keep the ISBN as a string: ISBN-10 values may end in an
            # 'X' check digit, which int() would reject, and every use
            # below needs a string anyway.
            isbn_list.append({'title': title, 'isbn': str(isbn)})
    # Walk each matched book's table of contents to find its readings.
    for book in isbn_list:
        book_url = ('https://jigsaw.vitalsource.com/books/'
                    + book['isbn'] + '/toc')
        parsed_toc = json.loads(s.get(book_url).text)
        reading_list = []
        reading_regex = re.compile(r'Reading\s{1,10}\d{1,3}')
        # One folder per book; os.path.join also drops the stray leading
        # space the old concatenation (r'\ ') put in every folder name.
        new_location = os.path.join(location, book['title'])
        if not os.path.exists(new_location):
            os.makedirs(new_location)
        os.chdir(new_location)
        # Grab the TOC entries for all of the readings.
        for entry in parsed_toc:
            if reading_regex.search(entry['title']):
                reading_list.append(entry)
        for sub_reading in reading_list:
            new_url = ('https://jigsaw.vitalsource.com/books/'
                       + book['isbn'] + '/epub' + str(sub_reading['path']))
            reading_html = s.get(new_url).text
            soup = BeautifulSoup(reading_html, "html.parser")
            # Strip <span> wrappers so the saved HTML is cleaner.
            for span_tag in soup.findAll('span'):
                span_tag.replace_with('')
            title = sub_reading['title']
            # NOTE(review): titles are used verbatim as file names; a
            # title containing characters illegal on the OS will make
            # open() fail -- consider sanitizing if that occurs.
            print('Now Scraping Reading: %s' % title)
            # Blue boxes are marked up as <figure class="example">.
            figures = soup.findAll("figure", {"class": "example"})
            if figures:
                # 'with' guarantees the file is closed even if a write
                # or the image-rewriting loop raises mid-way.
                with open('%s.html' % title, 'wb') as reading_file:
                    for figure in figures:
                        # Rewrite relative image paths to absolute URLs
                        # so the saved HTML still renders the images.
                        for image_tag in figure.findAll('img'):
                            image_tag['src'] = (
                                'https://jigsaw.vitalsource.com/books/'
                                + book['isbn'] + '/epub/OEBPS/'
                                + image_tag['src'])
                        reading_file.write(figure.encode('UTF-8'))
print('Done! Enjoy!')

0 commit comments

Comments
 (0)