core.py

from pathlib import Path
from urllib.parse import urlparse, urljoin

from bs4 import BeautifulSoup

from downloader import fetch_html_async, save_asset_async
from utils import normalize_url, is_internal_link
from logger import log_info
import config

# Tracks already visited pages to avoid redundant crawling
visited = set()


def extract_links(html: str, base_url: str) -> set:
    """
    Extracts all crawlable <a> href links from the given HTML,
    resolved against base_url.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for a in soup.find_all('a', href=True):
        href = a['href']
        # Skip non-navigable schemes and in-page anchors
        if href.startswith(('mailto:', 'tel:', '#', 'javascript:')):
            continue
        full_url = urljoin(base_url, href)
        links.add(full_url)
    return links
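

# Illustrative example only (hypothetical input, not part of the crawler):
# relative hrefs are resolved against the page URL and non-navigable
# schemes are dropped.
#
#     sample = '<a href="/about">About</a> <a href="mailto:x@y.z">Mail</a>'
#     extract_links(sample, 'https://example.com/')
#     # -> {'https://example.com/about'}   (the mailto: link is skipped)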


def save_html(content: str, output_dir: Path, filename: str):
    """
    Saves HTML content to the specified output directory.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / filename, 'w', encoding='utf-8') as f:
        f.write(content)


def save_headers(headers: dict, output_dir: Path):
    """
    Saves HTTP response headers to a file for debugging/inspection.
    """
    with open(output_dir / 'headers.txt', 'w', encoding='utf-8') as f:
        for k, v in headers.items():
            f.write(f"{k}: {v}\n")


async def parse_and_download_assets(html: str, base_url: str, output_dir: Path,
                                    session, seen_assets: set, tag_map=None):
    """
    Parses HTML and asynchronously downloads all discovered assets
    (CSS, JS, IMG). Skips fonts and already-downloaded resources using
    'seen_assets'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tags = tag_map or {
        'link': 'href',
        'script': 'src',
        'img': 'src'
    }
    for tag, attr in tags.items():
        for el in soup.find_all(tag):
            if tag == 'link':
                # Only mirror stylesheet links; skip icons, preloads, etc.
                rel = el.get('rel')
                if rel is None or 'stylesheet' not in rel:
                    continue
            asset_url = el.get(attr)
            if not asset_url:
                continue
            full_url = urljoin(base_url, asset_url)
            # Google-hosted fonts are deliberately not mirrored
            if 'fonts.googleapis.com' in full_url or 'fonts.gstatic.com' in full_url:
                continue
            await save_asset_async(full_url, tag, output_dir, session, base_url, seen_assets)
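

# The default tag_map covers stylesheets, scripts and images. A caller can
# pass a custom mapping to widen or narrow the asset scan, e.g. to also
# mirror <source src="..."> elements. Hypothetical usage sketch, assuming
# save_asset_async handles the extra tag name the same way it handles 'img':
#
#     await parse_and_download_assets(
#         html, base_url, output_dir, session, seen_assets,
#         tag_map={'img': 'src', 'source': 'src'},
#     )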


async def crawl_page_async(
    url: str,
    base_url: str,
    output_dir: Path,
    base_netloc: str,
    index_list_file: Path,
    session,
    seen_assets: set
):
    """
    Recursively crawls and downloads internal pages and their assets.
    Prevents duplicate visits and tracks asset usage.
    """
    norm_url = normalize_url(url)
    if norm_url in visited:
        return
    visited.add(norm_url)

    log_info(f"[CRAWL] {url}")
    html, headers = await fetch_html_async(url, session)
    if not html:
        return

    # Map the URL path to a flat on-disk filename
    parsed = urlparse(url)
    filename = 'index.html' if parsed.path in ('', '/') else parsed.path.strip('/').replace('/', '_') + '.html'

    save_html(html, output_dir, filename)
    save_headers(headers, output_dir)
    await parse_and_download_assets(html, base_url, output_dir, session, seen_assets)

    # Record the saved page so the mirror keeps an index of downloaded files
    with open(index_list_file, 'a', encoding='utf-8') as f:
        f.write(filename + '\n')

    # Recurse into internal links only
    links = extract_links(html, url)
    for link in links:
        if is_internal_link(link, base_netloc):
            await crawl_page_async(link, base_url, output_dir, base_netloc, index_list_file, session, seen_assets)
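

# Minimal driver sketch, not part of the original module: it assumes the
# downloader's fetch_html_async / save_asset_async accept an
# aiohttp.ClientSession, which this file does not show. The start URL and
# output paths below are placeholders; adjust to the real downloader API
# before use.
if __name__ == "__main__":
    import asyncio
    import aiohttp

    async def _main():
        start_url = "https://example.com/"          # hypothetical target site
        output_dir = Path("mirror")                 # where pages and assets are written
        index_list_file = output_dir / "pages.txt"  # list of saved page filenames
        output_dir.mkdir(parents=True, exist_ok=True)

        base_netloc = urlparse(start_url).netloc
        seen_assets: set = set()

        async with aiohttp.ClientSession() as session:
            await crawl_page_async(
                start_url,
                start_url,
                output_dir,
                base_netloc,
                index_list_file,
                session,
                seen_assets,
            )

    asyncio.run(_main())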