forked from mwcareen/BadPythonScriptsToShare
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathBookShelfScrape v1.2
More file actions
79 lines (71 loc) · 3.22 KB
/
BookShelfScrape v1.2
File metadata and controls
79 lines (71 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#! Python
# Goes to bookshelf and downloads blue box questions from the CFA textbooks
import requests, json, re, os
from bs4 import BeautifulSoup
LOGIN_URL = 'https://jigsaw.vitalsource.com/login'
BOOKS_URL = 'https://jigsaw.vitalsource.com/books/'
OUTPUT_FOLDER = 'CFABlueBoxes'

# Table-of-contents entries whose title looks like "Reading 12" are readings.
READING_REGEX = re.compile(r'Reading\s{1,10}\d{1,3}')

# Characters that cannot appear in a Windows file or folder name (a superset
# of what POSIX forbids), plus ASCII control characters.
UNSAFE_NAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def safe_filename(name):
    """Return *name* made safe to use as a single file or folder name.

    Each forbidden character is replaced with ``_`` and surrounding spaces
    and dots are stripped. ``'untitled'`` is returned if nothing is left.
    """
    cleaned = UNSAFE_NAME_CHARS.sub('_', str(name)).strip(' .')
    return cleaned or 'untitled'


def select_books(library, year, level):
    """Pick the CFA books for the requested year and level.

    ``library`` is an iterable of dicts with ``'title'`` and ``'isbn'`` keys.
    A book is kept when its title contains ``year``, contains ``'CFA'`` and
    contains ``level`` as a whole word, so level ``I`` does not also match
    ``II`` or ``III``.

    Returns a list of ``{'title': ..., 'isbn': ...}`` dicts. The ISBN is kept
    as a string so leading zeros and non-numeric identifiers survive when it
    is placed in a URL.
    """
    level_pattern = re.compile(r'\b' + re.escape(level) + r'\b')
    return [
        {'title': book['title'], 'isbn': str(book['isbn'])}
        for book in library
        if str(year) in book['title']
        and 'CFA' in book['title']
        and level_pattern.search(book['title'])
    ]


def collect_readings(table_of_contents):
    """Return the table-of-contents entries whose title matches a reading."""
    return [
        entry for entry in table_of_contents
        if READING_REGEX.search(entry['title'])
    ]


def fetch_library(session, payload):
    """Log in with *payload* and return the ``'books'`` list from the reply.

    Exits with a readable message when the reply is not the expected JSON,
    which is what happens when the login is rejected.
    """
    # NOTE(review): ``params=`` puts the credentials in the URL query string.
    # ``data=`` would send them in the request body instead -- confirm the
    # endpoint accepts that before switching.
    response = session.post(LOGIN_URL, params=payload)
    try:
        return json.loads(response.text)['books']
    except (ValueError, KeyError, TypeError) as err:
        raise SystemExit(
            'Login failed or the server returned an unexpected response '
            '(HTTP %s).' % response.status_code
        ) from err


def save_blue_boxes(session, isbn, reading, folder):
    """Download one reading and write its blue boxes to an HTML file.

    The blue boxes are the ``<figure class="example">`` elements of the
    reading page. They are written, UTF-8 encoded, to
    ``<folder>/<reading title>.html``. Returns ``True`` when a file was
    written and ``False`` when the reading has no blue boxes.
    """
    page_url = BOOKS_URL + isbn + '/epub' + str(reading['path'])
    soup = BeautifulSoup(session.get(page_url).text, 'html.parser')

    # Drop every <span> before extracting the figures.
    for span_tag in soup.find_all('span'):
        span_tag.replace_with('')

    figures = soup.find_all('figure', {'class': 'example'})
    if not figures:
        return False

    image_base = BOOKS_URL + isbn + '/epub/OEBPS/'
    target = os.path.join(folder, safe_filename(reading['title']) + '.html')
    with open(target, 'wb') as reading_file:
        for figure in figures:
            # Prefix each image source with the book's epub base URL.
            for image_tag in figure.find_all('img'):
                image_tag['src'] = image_base + image_tag['src']
            reading_file.write(figure.encode('UTF-8'))
    return True


def main():
    """Prompt for credentials and book selection, then scrape every match."""
    # Function-scope import: only the interactive entry point needs it.
    from getpass import getpass

    username = input('Please type in your E-mail: ')
    # getpass keeps the password from being echoed to the terminal.
    password = getpass('Please type in your password: ')
    payload = {
        'user[email]': username,
        'user[password]': password,
    }

    year = input('What year of books do you want?: ').strip()
    level = input('What Level? (I,II,III): ').strip().upper()

    # Everything is saved under ./CFABlueBoxes/<book title>/<reading>.html
    location = os.path.join(os.path.abspath(os.curdir), OUTPUT_FOLDER)
    os.makedirs(location, exist_ok=True)
    print('ALL FILES WILL BE SAVED IN: %s' % location)

    # One session keeps the login cookies for all later requests.
    with requests.Session() as session:
        books = select_books(fetch_library(session, payload), year, level)
        for book in books:
            isbn = book['isbn']
            toc = json.loads(session.get(BOOKS_URL + isbn + '/toc').text)

            book_folder = os.path.join(location, safe_filename(book['title']))
            os.makedirs(book_folder, exist_ok=True)

            for reading in collect_readings(toc):
                print('Now Scraping Reading: %s' % reading['title'])
                save_blue_boxes(session, isbn, reading, book_folder)
            print('Book {} Done! Enjoy!'.format(book['title']))

    print('All books for year {0} and Level {1} done! Enjoy!'.format(year, level))


if __name__ == '__main__':
    main()