-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
91 lines (73 loc) · 2.77 KB
/
main.py
File metadata and controls
91 lines (73 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import asyncio
import sys
from pathlib import Path
from urllib.parse import urlparse
from core import crawl_page_async
from rewrite import rewrite_asset_paths
from logger import log_info, log_fail
import config
import time
import aiohttp
# 🔧 Fix for aiodns on Windows: the default Proactor event loop is not
# compatible with aiodns-based resolution, so switch to the selector loop.
if sys.platform.startswith("win"):
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
def get_file_stats(output_dir: Path) -> dict:
    """Collect per-type file counts and total size for a dump directory.

    Args:
        output_dir: Root directory to scan recursively.

    Returns:
        dict with keys ``'css'``, ``'js'``, ``'img'``, ``'other'``
        (file counts bucketed by extension), ``'total_files'`` (count of
        all regular files) and ``'total_size'`` (cumulative size in MB,
        as a float).
    """
    # Extensions counted as images; everything unmatched falls into 'other'.
    image_exts = {'.png', '.jpg', '.jpeg', '.svg', '.gif', '.webp'}
    stats = {
        'css': 0,
        'js': 0,
        'img': 0,
        'other': 0,
        'total_files': 0,
        'total_size': 0.0,  # in MB
    }
    for file in output_dir.rglob('*'):
        if not file.is_file():
            continue  # skip directories, symlinked dirs, etc.
        ext = file.suffix.lower()
        stats['total_files'] += 1
        stats['total_size'] += file.stat().st_size / (1024 * 1024)
        if ext == '.css':
            stats['css'] += 1
        elif ext == '.js':
            stats['js'] += 1
        elif ext in image_exts:
            stats['img'] += 1
        else:
            stats['other'] += 1
    return stats
async def run_webdumper():
    """Interactive entry point: prompt for a URL, crawl it into a local
    dump directory, rewrite asset paths, and print summary statistics.

    Side effects: reads from stdin, deletes/recreates the per-host
    index file under ``config.OUTPUT_DIR``, and writes the dump tree.
    """
    print("============================")
    print(" WebDumper ")
    print(" Ex41T ")
    print("============================")
    url = input("URL >>> ").strip()
    # NOTE(review): startswith("http") also accepts e.g. "httpfoo://";
    # a stricter check against ("http://", "https://") may be intended.
    if not url.startswith("http"):
        log_fail("URL must start with http:// or https://")
        return
    parsed = urlparse(url)
    # NOTE(review): appending '/' assumes the URL is a site/section root;
    # for a URL ending in a file (e.g. .../page.html) this base is odd —
    # confirm against crawl_page_async's expectations.
    base_url = url if url.endswith('/') else url + '/'
    # Sanitize host:port into a filesystem-safe directory name.
    host = parsed.netloc.replace(':', '_')
    target = Path(config.OUTPUT_DIR) / host
    index_file = target / 'index_list.txt'
    # Remove a stale index from a previous run so the crawl starts clean.
    if index_file.exists():
        index_file.unlink()
    start_time = time.time()
    log_info(f"Starting crawl for: {url}")
    async with aiohttp.ClientSession() as session:
        # seen_assets deduplicates downloads across the crawl; presumably
        # mutated by crawl_page_async — semantics live in core.py.
        seen_assets = set()
        await crawl_page_async(url, base_url, target, parsed.netloc, index_file, session, seen_assets)
    log_info("Rewriting asset paths...")
    # Post-process the dumped files so references point at local copies.
    rewrite_asset_paths(target, index_file)
    elapsed = time.time() - start_time
    stats = get_file_stats(target)
    print("\n[✔] Dump complete. Saved to:", target.resolve())
    print(f"\n[📊] Dump Stats:")
    print(f" • Duration : {elapsed:.2f} seconds")
    print(f" • Total files : {stats['total_files']}")
    print(f" • Total size : {stats['total_size']:.2f} MB")
    print(f" • CSS files : {stats['css']}")
    print(f" • JS files : {stats['js']}")
    print(f" • Image files : {stats['img']}")
    print(f" • Other files : {stats['other']}")
if __name__ == "__main__":
    # Script entry point: run the interactive dumper on a fresh event loop.
    asyncio.run(run_webdumper())