-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathseleniumDemo.py
More file actions
46 lines (36 loc) · 1.3 KB
/
seleniumDemo.py
File metadata and controls
46 lines (36 loc) · 1.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
import time
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Browser configuration
chrome_options = Options()
chrome_options.add_argument('--incognito')
chrome_options.add_argument('--headless')  # original note: works around a Google bug
chrome_options.add_argument('--disable-gpu')
# Present a desktop Firefox user agent instead of the default Chrome one.
chrome_options.add_argument('user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0')
# Pass the options through `options=`: the older `chrome_options=` keyword was
# deprecated in Selenium 3.8 and is no longer accepted by recent Selenium 4
# releases, where it raises a TypeError.
browser = webdriver.Chrome(options=chrome_options)
# Query configuration
query = 'selenium'  # search term
# URL-encode the term so spaces and reserved characters survive in the query string.
browser.get("https://www.google.com/search?q={}".format(quote_plus(query)))
next_page_times = 10  # maximum number of result pages to visit

# Compiled once, outside the loop. Both patterns are applied to BeautifulSoup's
# prettified markup, which is why they expect a newline followed by indentation.
# Raw strings keep the regex escapes (\w, \d, "\ ") from being treated as
# invalid Python string escapes.
title_pattern = re.compile(r'<h3 class="[\w\d]{6} [\w\d]{6}">\n\ +(.+)')
url_pattern = re.compile(r'<div class="r">\ *\n\ *<a href="(.+)" onmousedown')

# Crawl
try:
    for _page in range(next_page_times):
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        content = soup.prettify()
        # Get titles and urls (prettify once and reuse the result for both patterns)
        titles = title_pattern.findall(content)
        urls = url_pattern.findall(content)
        # zip stops at the shorter list, so unmatched extras are dropped.
        for title, url in zip(titles, urls):
            print(title, url)
        # Wait before requesting the next page
        time.sleep(5)
        # Turn to the next page; the link text is the Traditional Chinese
        # "next page" label.
        try:
            next_link = browser.find_element(By.LINK_TEXT, '下一頁')
        except NoSuchElementException:
            # No "next page" link: the results ran out before next_page_times.
            print('Search Early Stopping.')
            break
        else:
            next_link.click()
finally:
    # Shut down the browser. quit() ends the whole driver session (close() only
    # closes the current window), and `finally` guarantees it runs even when
    # the crawl raises.
    browser.quit()