-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
90 lines (75 loc) · 3.05 KB
/
main.py
File metadata and controls
90 lines (75 loc) · 3.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import argparse
import os
import time
from urllib.parse import urljoin, urlparse, urlsplit
import requests
from bs4 import BeautifulSoup
from pathvalidate import sanitize_filename
def check_for_redirect(response):
    """Raise requests.HTTPError if *response* went through a redirect.

    NOTE(review): presumably the site redirects missing book pages to
    its front page, so any redirect history means "page not found" —
    confirm against the site's behaviour.
    """
    if not response.history:
        return
    raise requests.HTTPError('HTTP Error')
def download_txt(url, filename, folder='Books/'):
    """Download a book's text from *url* and save it as a .txt file.

    The *filename* is sanitized for the filesystem before use.

    Returns the path of the written file.
    Raises requests.HTTPError on a bad status or a redirected request.
    """
    response = requests.get(url)
    response.raise_for_status()
    check_for_redirect(response)
    safe_name = sanitize_filename(filename)
    target_path = os.path.join(folder, f'{safe_name}.txt')
    with open(target_path, 'wb') as book_file:
        book_file.write(response.content)
    return target_path
def download_image(url, folder='images/'):
    """Download a book cover image into *folder*.

    *url* may be relative; it is resolved against https://tululu.org/.
    The file name is taken from the last path segment of the URL.

    Raises requests.HTTPError on a bad status or a redirected request.
    """
    absolute_url = urljoin('https://tululu.org/', url)
    response = requests.get(absolute_url)
    response.raise_for_status()
    check_for_redirect(response)
    image_name = urlsplit(absolute_url).path.split('/')[-1]
    destination = os.path.join(folder, image_name)
    with open(destination, 'wb') as image_file:
        image_file.write(response.content)
def parse_book_page(response, url):
    """Parse a tululu.org book page into a metadata dict.

    *response* is the page's HTTP response; *url* is the page URL, used
    to resolve relative links.

    Returns a dict with keys: book_name, book_author, book_genre,
    comments, image_url, book_url.
    """
    soup = BeautifulSoup(response.text, 'lxml')
    content = soup.find('div', id='content')
    # The <h1> holds "Title :: Author".
    title_parts = content.find('h1').text.split('::')
    book_name = title_parts[0].strip()
    book_author = title_parts[1].strip()
    book_genre = content.find('span', class_='d_book').find('a').text
    comments = []
    for comment_block in soup.find_all('div', class_='texts'):
        comments.append(comment_block.find('span').text)
    image_src = soup.find('div', class_='bookimage').find('img')['src']
    # NOTE(review): index 10 assumes a fixed link layout on the page —
    # the 11th <a> inside #content is presumed to be the txt download
    # link; confirm against the live markup.
    txt_href = content.find_all('a')[10]['href']
    return {
        'book_name': book_name,
        'book_author': book_author,
        'book_genre': book_genre,
        'comments': comments,
        'image_url': urljoin(url, image_src),
        'book_url': urljoin(url, txt_href),
    }
def main():
    """Download books first_book..last_book (inclusive) from tululu.org.

    Creates ./Books and ./images, then for each book id fetches the
    page, parses its metadata, and saves the book text and cover image.
    HTTP errors (including redirects for missing books) and connection
    failures are reported and the loop moves on to the next book.
    """
    os.makedirs('./Books', exist_ok=True)
    os.makedirs('./images', exist_ok=True)
    parser = argparse.ArgumentParser(
        description='Скрипт скачивает книги с сайта https://tululu.org.'
    )
    parser.add_argument('-first_book', help='Первая книга', type=int, default=1)
    # Typo fix in help text: «Последня» -> «Последняя».
    parser.add_argument('-last_book', help='Последняя книга', type=int, default=11)
    args = parser.parse_args()
    for book_num in range(args.first_book, args.last_book + 1):
        url = f'https://tululu.org/b{book_num}/'
        try:
            response = requests.get(url)
            response.raise_for_status()
            check_for_redirect(response)
            book = parse_book_page(response, url)
            # BUG FIX: the original referenced an undefined name `i`
            # here (NameError on every iteration); the loop variable
            # is book_num.
            download_txt(book['book_url'], f"{book_num}. {book['book_name']}")
            download_image(book['image_url'])
        except requests.HTTPError as err:
            print(err.args[0])
        except requests.ConnectionError as err:
            # Missing space fixed in the message below.
            print(f'Сбой соединения. Ошибка {err}')
            # Back off before the next attempt. NOTE(review): source
            # indentation was lost; the sleep is placed inside the
            # connection-error handler — confirm intended placement.
            time.sleep(3)
# Run the downloader only when executed as a script, not on import.
if __name__ == '__main__':
    main()