downloader.py
import asyncio
from pathlib import Path
from urllib.parse import urljoin, urlparse

from aiohttp import ClientSession

import config
from logger import log_fail, log_info, log_warn
from utils import extract_imported_css


async def fetch_html_async(url: str, session: ClientSession):
    """
    Fetch HTML or CSS content from a URL asynchronously.

    Returns (text, headers) on success, or (None, None) when the response
    is not 'text/html' / 'text/css' or the request fails.
    """
    headers = config.HEADERS
    try:
        # ssl=False skips certificate verification so misconfigured hosts still resolve.
        async with session.get(url, headers=headers, timeout=config.TIMEOUT, ssl=False) as response:
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' in content_type or 'text/css' in content_type:
                text = await response.text()
                return text, dict(response.headers)
            else:
                log_warn(f"[SKIP] Non-text content at {url} ({content_type})")
                return None, None
    except Exception as e:
        log_fail(f"[ERROR] Failed to fetch {url}: {e}")
        return None, None


async def save_asset_async(
    url: str,
    tag: str,
    output_dir: Path,
    session: ClientSession,
    base_url: str = None,
    seen_assets: set = None
):
    """
    Download an asset (CSS, JS, image, ...) and save it to disk.

    Uses 'seen_assets' to prevent duplicate downloads and recursively
    resolves nested CSS @import rules.
    """
    # Inline data: URIs are embedded in the page itself; nothing to download.
    if url.startswith("data:"):
        log_warn(f"[SKIP] Skipping base64 asset: {url[:40]}...")
        return

    # Skip assets that were already downloaded (query string and fragment ignored).
    if seen_assets is not None:
        parsed_url = urlparse(url)
        clean_url = parsed_url._replace(query="", fragment="").geturl()
        if clean_url in seen_assets:
            return
        seen_assets.add(clean_url)

    headers = config.HEADERS
    parsed = urlparse(url)
    file_name = Path(parsed.path).name or 'asset'

    # Route the file into assets/<type> based on the HTML tag it came from.
    subdir = {
        'link': 'assets/css',
        'script': 'assets/js',
        'img': 'assets/img'
    }.get(tag, 'assets/other')
    full_path = output_dir / subdir / file_name
    full_path.parent.mkdir(parents=True, exist_ok=True)

    # Retry up to RETRY_COUNT times; the for/else reports the final failure.
    for attempt in range(config.RETRY_COUNT):
        try:
            async with session.get(url, headers=headers, timeout=config.TIMEOUT, ssl=False) as response:
                response.raise_for_status()
                content_type = response.headers.get('Content-Type', '')
                content = await response.read()
                with open(full_path, 'wb') as f:
                    f.write(content)
                log_info(f"[ASSET] Saved {file_name} ({tag})")

                # Follow @import rules inside downloaded stylesheets.
                if tag == 'link' and full_path.suffix == '.css' and 'text/css' in content_type:
                    css_text = content.decode('utf-8', errors='ignore')
                    imported_urls = extract_imported_css(css_text)
                    for rel_url in imported_urls:
                        abs_url = urljoin(base_url or url, rel_url)
                        await save_asset_async(abs_url, 'link', output_dir, session, url, seen_assets)
                break
        except Exception as e:
            log_warn(f"[WARN] Attempt {attempt + 1}/{config.RETRY_COUNT} failed for {url}: {e}")
            await asyncio.sleep(2)
    else:
        log_fail(f"[FAIL] Could not download: {url}")
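

# Usage sketch (not from the original module): one minimal way to drive the two
# coroutines above together. The page URL, stylesheet path, and output directory
# below are hypothetical placeholders, and the sketch assumes the same
# config.HEADERS / config.TIMEOUT / config.RETRY_COUNT settings the functions
# already rely on. A real crawler would discover asset URLs by parsing the
# fetched HTML instead of hard-coding one.
async def _demo():
    page_url = "https://example.com/"      # hypothetical target page
    output_dir = Path("site_mirror")       # hypothetical output folder
    seen_assets = set()
    async with ClientSession() as session:
        html, _headers = await fetch_html_async(page_url, session)
        if html is not None:
            # Stand-in for the HTML-parsing step that would normally yield asset URLs.
            css_url = urljoin(page_url, "assets/style.css")
            await save_asset_async(css_url, 'link', output_dir, session,
                                   base_url=page_url, seen_assets=seen_assets)


if __name__ == "__main__":
    asyncio.run(_demo())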