Skip to content

Commit 292b6ef

Browse files
committed
The BookShelf Scraper
It downloads your CFA stuff!
1 parent 7c928ca commit 292b6ef

1 file changed

Lines changed: 79 additions & 0 deletions

File tree

BookShelfScrape.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
#! python3
# BookShelfScrape.py
# Logs into VitalSource Bookshelf and downloads the "blue box" example
# questions from matching CFA textbooks, saving one HTML file per reading.

import requests, json, re, pandas, os   # pandas currently unused; kept for planned table export
from bs4 import BeautifulSoup
from docx import Document               # unused; kept for planned Word-document export

# Credentials are read interactively so they never live in the script.
username = input('Please type in your E-mail: ')
password = input('Please type in your password: ')

# Login form fields expected by the VitalSource endpoint.
payload = {
    'user[email]': username,
    'user[password]': password,
}

# Which edition of the curriculum to pull.
Year = input('What year of books do you want?: ')
Level = input('What Level? (I,II,III): ')

# Create the output folder under the current working directory.
# os.path.join replaces the original hard-coded '\' concatenation, which was
# Windows-only and (via r'\ ') produced folder names with a stray leading space.
startingLocation = os.path.abspath(os.curdir)
location = os.path.join(startingLocation, 'CFABlueBoxes')
os.makedirs(location, exist_ok=True)    # exist_ok avoids the check-then-create race
os.chdir(location)

# Compile patterns once, outside the loops.
# Matches e.g. "2018 CFA Level II " in a book title.
my_regex = re.compile(str(Year) + r'\sCFA\sLevel\s' + Level + r'\s')
# Matches "Reading 12"-style link text in a table of contents.
readingRegex = re.compile(r'Reading\s{1,10}\d{1,3}')

# Log in once and reuse the authenticated session for every request.
with requests.Session() as s:
    # NOTE(review): credentials are sent in the URL query string (`params=`);
    # form data (`data=`) is the conventional choice for a login POST —
    # confirm which the endpoint actually expects before changing.
    p = s.post('https://jigsaw.vitalsource.com/login', params=payload)
    parsed_Library = json.loads(p.text)['books']

    # Keep only the books whose title matches the requested year and level.
    isbnList = [
        {'title': book['title'], 'isbn': int(book['isbn'])}
        for book in parsed_Library
        if my_regex.search(book['title'])
    ]

    for book in isbnList:
        # Fetch this book's table of contents.
        bookUrl = 'https://jigsaw.vitalsource.com/books/' + str(book['isbn']) + '/cfi/6/8!'
        soup = BeautifulSoup(s.get(bookUrl).text, "html.parser")

        # One sub-folder per book, named after the book title.
        newLocation = os.path.join(location, book['title'])
        os.makedirs(newLocation, exist_ok=True)
        os.chdir(newLocation)

        # Walk every <li> link and keep those pointing at a Reading.
        for tag in soup.findAll('li'):
            for link in tag.findAll('a'):
                if not readingRegex.search(link.text):
                    continue

                newURL = ('https://jigsaw.vitalsource.com/books/'
                          + str(book['isbn']) + '/epub/OEBPS/' + str(link['href']))
                soupReading = BeautifulSoup(s.get(newURL).text, "html.parser")

                # Drop <span> wrappers so the saved HTML is cleaner.
                for span_tag in soupReading.findAll('span'):
                    span_tag.replace_with('')

                # Build a filesystem-safe name from the page <title>:
                # keep only alphabetic runs, joined by single spaces.
                title = " ".join(re.findall("[a-zA-Z]+", soupReading.find('title').text))

                # Blue boxes are <figure class="example"> elements.
                figures = soupReading.findAll("figure", {"class": "example"})
                if not figures:
                    continue

                # Context manager guarantees the file is closed even if a
                # request inside the loop raises (original leaked on error).
                with open('%s.html' % title, 'wb') as readingFile:
                    for figure in figures:
                        # Rewrite relative image sources to absolute URLs so
                        # the saved file renders outside the session.
                        for image_tag in figure.findAll('img'):
                            image_tag['src'] = ('https://jigsaw.vitalsource.com/books/'
                                                + str(book['isbn']) + '/epub/OEBPS/'
                                                + image_tag['src'])
                        readingFile.write(figure.encode('UTF-8'))

0 commit comments

Comments
 (0)