File tree Expand file tree Collapse file tree 2 files changed +68
-0
lines changed
Expand file tree Collapse file tree 2 files changed +68
-0
lines changed Original file line number Diff line number Diff line change 1+ from bs4 import BeautifulSoup
2+ import requests
3+ import csv
4+
# Scrape the articles listed on coreyms.com and record each one's headline,
# summary and YouTube link in cms_scrape.csv, echoing every record to stdout.

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as content.
response = requests.get('http://coreyms.com', timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')

# newline='' is what the csv module asks for when opening its output file
# (otherwise extra blank rows can appear on Windows). The with-block closes the
# file even when parsing an article raises part-way through the loop.
with open('cms_scrape.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['headline', 'summary', 'video_link'])

    for article in soup.find_all('article'):
        headline = article.h2.a.text
        print(headline)

        summary = article.find('div', class_='entry-content').p.text
        print(summary)

        try:
            vid_src = article.find('iframe', class_='youtube-player')['src']
            # The video id is the fifth '/'-separated piece of the iframe src
            # with any '?query' suffix removed (presumably an embed URL of the
            # form https://www.youtube.com/embed/<id>?... -- confirm on site).
            vid_id = vid_src.split('/')[4].split('?')[0]
        except (TypeError, KeyError, IndexError):
            # TypeError: no matching iframe (find() returned None);
            # KeyError: iframe has no src; IndexError: unexpected URL shape.
            # Articles without a usable video are still written, with no link.
            yt_link = None
        else:
            yt_link = f'https://youtube.com/watch?v={vid_id}'

        print(yt_link)
        print()

        csv_writer.writerow([headline, summary, yt_link])
Original file line number Diff line number Diff line change 1+ <!doctype html>
<html class="no-js" lang="">
<head>
    <title>Test - A Sample Website</title>
    <meta charset="utf-8">
    <link rel="stylesheet" href="css/normalize.css">
    <link rel="stylesheet" href="css/main.css">
</head>
<body>
    <h1 id="site_title">Test Website</h1>
    <hr>
    <!-- Each article is a div.article holding a linked h2 headline and a p summary. -->
    <div class="article">
        <h2><a href="article_1.html">Article 1 Headline</a></h2>
        <p>This is a summary of article 1</p>
    </div>
    <hr>
    <div class="article">
        <h2><a href="article_2.html">Article 2 Headline</a></h2>
        <p>This is a summary of article 2</p>
    </div>
    <hr>

    <div class="footer">
        <p>Footer Information</p>
    </div>

    <script src="js/vendor/modernizr-3.5.0.min.js"></script>
    <script src="js/plugins.js"></script>
    <script src="js/main.js"></script>
</body>
</html>
You can’t perform that action at this time.
0 commit comments