Skip to content

Commit 292c642

Browse files
BookShelfScrape v1.1
1 parent 954f0f2 commit 292c642

1 file changed

Lines changed: 80 additions & 0 deletions

File tree

BookShelfScrape v1.1

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
#! Python
# Goes to Bookshelf (VitalSource) and downloads the "blue box" example
# questions from the CFA textbooks, saving each reading's examples as an
# HTML file under a per-book folder.

import requests, json, re, os
from bs4 import BeautifulSoup

username = input('Please type in your E-mail: ')
password = input('Please type in your password: ')

# Credentials for the VitalSource login form.
payload = {
    'user[email]': username,
    'user[password]': password
}

# Which curriculum year and CFA level of books to download.
year = input('What year of books do you want?: ')
level = input('What Level? (I,II,III): ')

# Create the output folder (portable: os.path.join instead of hand-built
# backslash paths, which also triggered invalid-escape warnings).
starting_location = os.path.abspath(os.curdir)
location = os.path.join(starting_location, 'CFABlueBoxes')
if not os.path.exists(location):
    os.makedirs(location)
os.chdir(location)
print('ALL FILES WILL BE SAVED IN: %s' % location)

# Log in and keep one session (cookies) open for all scraping requests.
with requests.Session() as s:
    # NOTE(review): credentials are sent as URL query parameters
    # (params=); the endpoint appears to accept this, but form data
    # (data=payload) is the conventional choice -- confirm before changing.
    p = s.post('https://jigsaw.vitalsource.com/login', params=payload)
    parsed_library = json.loads(p.text)['books']
    # re.escape keeps stray regex metacharacters in the user's input
    # from breaking (or widening) the title match.
    title_regex = re.compile(
        re.escape(str(year)) + r'\sCFA\sLevel\s' + re.escape(level) + r'\s')
    isbn_list = []
    # Collect the books whose titles match the requested year and level.
    for book in parsed_library:
        title = book['title']
        isbn = book['isbn']
        if title_regex.search(title):
            # Keep the ISBN as a string: ISBN-10 values may end in an
            # 'X' check digit, which int() would reject, and every use
            # below needs a string anyway.
            isbn_list.append({'title': title, 'isbn': str(isbn)})
    # Walk each matched book's table of contents to find its readings.
    for book in isbn_list:
        book_url = ('https://jigsaw.vitalsource.com/books/'
                    + book['isbn'] + '/toc')
        parsed_toc = json.loads(s.get(book_url).text)
        reading_list = []
        reading_regex = re.compile(r'Reading\s{1,10}\d{1,3}')
        # One folder per book; os.path.join also drops the stray leading
        # space the old concatenation (r'\ ') put in every folder name.
        new_location = os.path.join(location, book['title'])
        if not os.path.exists(new_location):
            os.makedirs(new_location)
        os.chdir(new_location)
        # Grab the TOC entries for all of the readings.
        for entry in parsed_toc:
            if reading_regex.search(entry['title']):
                reading_list.append(entry)
        for sub_reading in reading_list:
            new_url = ('https://jigsaw.vitalsource.com/books/'
                       + book['isbn'] + '/epub' + str(sub_reading['path']))
            reading_html = s.get(new_url).text
            soup = BeautifulSoup(reading_html, "html.parser")
            # Strip <span> wrappers so the saved HTML is cleaner.
            for span_tag in soup.findAll('span'):
                span_tag.replace_with('')
            title = sub_reading['title']
            # NOTE(review): titles are used verbatim as file names; a
            # title containing characters illegal on the OS will make
            # open() fail -- consider sanitizing if that occurs.
            print('Now Scraping Reading: %s' % title)
            # Blue boxes are marked up as <figure class="example">.
            figures = soup.findAll("figure", {"class": "example"})
            if figures:
                # 'with' guarantees the file is closed even if a write
                # or the image-rewriting loop raises mid-way.
                with open('%s.html' % title, 'wb') as reading_file:
                    for figure in figures:
                        # Rewrite relative image paths to absolute URLs
                        # so the saved HTML still renders the images.
                        for image_tag in figure.findAll('img'):
                            image_tag['src'] = (
                                'https://jigsaw.vitalsource.com/books/'
                                + book['isbn'] + '/epub/OEBPS/'
                                + image_tag['src'])
                        reading_file.write(figure.encode('UTF-8'))
print('Done! Enjoy!')

0 commit comments

Comments
 (0)