-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathGNews_With_Selenium.py
More file actions
73 lines (67 loc) · 2.65 KB
/
GNews_With_Selenium.py
File metadata and controls
73 lines (67 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Package installation (third-party dependencies imported below):
# pip install selenium webdriver-manager googlenewsdecoder
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from googlenewsdecoder import new_decoderv1
import pprint
import base64
import re
# # Setup WebDriver for Chrome
# # driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# driver = webdriver.Chrome()
#
# # Go to Google News (for US)
# # url = "https://news.google.com/home?hl=en-US&gl=US&ceid=US:en"
# # url = "https://news.google.com/home?hl=en-IN&gl=IN&ceid=IN:en"
# url = r"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFZxYUdjU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"
# driver.get(url)
#
# # Allow some time for the page to load dynamic content
# driver.implicitly_wait(5)
#
# # Extract headlines and URLs
# headlines = driver.find_elements(By.XPATH, '//h3/a')
#
# for headline in headlines:
#     title = headline.text
#     link = headline.get_attribute('href')
#     print(f"Title: {title}\nLink: {link}\n")
#
# # Close the driver after scraping
# driver.quit()
# Scrape the first headline block from a Google News topic page using headless
# Chrome, print its title, link and news source, then pass the link to
# googlenewsdecoder's new_decoderv1 and print the decoded URL it returns.

# Set up Chrome options
options = Options()
options.add_argument("--headless")  # Run in headless mode
driver = webdriver.Chrome(options=options)
url = r"https://news.google.com/topics/CAAqKggKIiRDQkFTRlFvSUwyMHZNRFZxYUdjU0JXVnVMVWRDR2dKSlRpZ0FQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen"

# Defaults so these names always exist, even when nothing could be scraped.
headline = None
headline_url = None
news_source = None

try:
    driver.get(url)
    # Implicit wait: each element lookup below may poll for up to 5 seconds
    # before giving up and returning an empty list.
    driver.implicitly_wait(5)

    # NOTE(review): 'IBr9hb', 'gPFEn' and 'vr1PYe' look like generated class
    # names from the Google News markup and may change without notice, so
    # every lookup is checked for an empty result instead of indexing blindly.
    headline_componet = driver.find_elements(By.CLASS_NAME, 'IBr9hb')
    if headline_componet:
        first_block = headline_componet[0]
        news_source_componet = first_block.find_elements(By.CLASS_NAME, 'vr1PYe')
        # news_source_componet = first_block.find_elements(By.CLASS_NAME, 'a7P8l')
        article_componet = first_block.find_elements(By.CLASS_NAME, 'gPFEn')

        # Read everything from the elements while the browser session is
        # still alive; element handles are unusable after driver.quit().
        if article_componet:
            headline = article_componet[0].text
            headline_url = article_componet[0].get_attribute('href')
        if news_source_componet:
            news_source = news_source_componet[0].get_attribute('innerText')
finally:
    # Always shut the browser down, even if scraping raised, so no headless
    # Chrome process is left running.
    driver.quit()

if headline_url:
    print("Headline: {}".format(headline))
    print("Headline-URL: {}".format(headline_url))
    print("news_source: {}".format(news_source))
else:
    print("Error: no headline link found on the page")

interval_time = 5  # default interval is None, if not specified
open_url = None  # set to the decoded URL on success, stays None otherwise

# Only attempt decoding when a link was actually scraped.
if headline_url:
    try:
        decoded_url = new_decoderv1(headline_url, interval=interval_time)
        if decoded_url.get("status"):
            print("Decoded URL:", decoded_url["decoded_url"])
            open_url = decoded_url["decoded_url"]
        else:
            print("Error:", decoded_url["message"])
    except Exception as e:
        # Top-level boundary of the script: report the failure and carry on.
        print(f"Error occurred: {e}")
# driver = webdriver.Chrome()
# driver.get(open_url)
# driver.implicitly_wait(5)