-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathWebMiner.py
More file actions
454 lines (391 loc) · 18.9 KB
/
WebMiner.py
File metadata and controls
454 lines (391 loc) · 18.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
#!/usr/bin/env python3
# WebMiner v1.0 - Enhanced Web Scraper/Searcher
# FINAL FIXED VERSION - Colorama init order fixed
import os
import sys
import time
import json
import csv
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime
from typing import Optional, List, Dict
# Initialize colorama FIRST before any colored output
from colorama import Fore, Style, init
init(autoreset=True)
try:
    from tqdm import tqdm
except ImportError:
    # Fallback if tqdm not available: pass the iterable through unchanged,
    # so progress bars silently degrade to plain iteration.
    def tqdm(iterable, desc="Progress"):
        return iterable
# Try multiple import methods for googlesearch
# (different PyPI distributions expose the module differently).
search = None
GOOGLESEARCH_AVAILABLE = False
try:
    # Method 1: Standard import
    from googlesearch import search
    GOOGLESEARCH_AVAILABLE = True
    print(Fore.GREEN + "[✓] Google search module loaded successfully" + Style.RESET_ALL)
except ImportError:
    try:
        # Method 2: Import module then get function
        import googlesearch
        if hasattr(googlesearch, 'search'):
            search = googlesearch.search
            GOOGLESEARCH_AVAILABLE = True
            print(Fore.GREEN + "[✓] Google search module loaded (method 2)" + Style.RESET_ALL)
    except ImportError:
        pass
if not GOOGLESEARCH_AVAILABLE:
    print(Fore.YELLOW + "[!] Google search unavailable. Install: pip install googlesearch-python" + Style.RESET_ALL)
# ASCII-art title banner printed at startup.
BANNER1 = '''
░██╗░░░░░░░██╗███████╗██████╗░███╗░░░███╗██╗███╗░░██╗███████╗██████╗░
░██║░░██╗░░██║██╔════╝██╔══██╗████╗░████║██║████╗░██║██╔════╝██╔══██╗
░╚██╗████╗██╔╝█████╗░░██████╦╝██╔████╔██║██║██╔██╗██║█████╗░░██████╔╝
░░████╔═████║░██╔══╝░░██╔══██╗██║╚██╔╝██║██║██║╚████║██╔══╝░░██╔══██╗
░░╚██╔╝░╚██╔╝░███████╗██████╦╝██║░╚═╝░██║██║██║░╚███║███████╗██║░░██║
░░░╚═╝░░░╚═╝░░╚══════╝╚═════╝░╚═╝░░░░░╚═╝╚═╝╚═╝░░╚══╝╚══════╝╚═╝░░╚═╝
'''
# Subtitle line shown under the main banner.
BANNER2 = '''
WebMiner v1.0 - Web Scraping & Search Tool
'''
# In-memory log of scrape/search actions for the current session
# (appended to by scrape_url and main_menu, read by show_history).
HISTORY = []
def clear_screen():
    """Wipe the terminal using the platform-appropriate shell command."""
    command = 'cls' if os.name == 'nt' else 'clear'
    os.system(command)
def banner():
    """Print the WebMiner ASCII banner and subtitle in color."""
    for color, text in ((Fore.CYAN, BANNER1), (Fore.MAGENTA, BANNER2)):
        print(color + text + Style.RESET_ALL)
def is_valid_url(url):
    """Return True when *url* carries both a scheme and a network location."""
    parts = urlparse(url)
    return bool(parts.scheme) and bool(parts.netloc)
def retry_request(url, retries=3, delay=2):
    """GET *url* with up to *retries* attempts, pausing *delay* seconds between.

    Returns the successful requests.Response. Raises
    requests.RequestException after the last attempt fails, chained to the
    final underlying error so callers can see the root cause.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    last_error = None
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            last_error = e  # remembered for exception chaining below
            print(Fore.RED + f"[!] Attempt {attempt+1} failed: {str(e)}" + Style.RESET_ALL)
            if attempt < retries - 1:
                print(Fore.YELLOW + f"[*] Retrying in {delay} seconds..." + Style.RESET_ALL)
                time.sleep(delay)
    # Bug fix: previously the original failure was discarded; chain it.
    raise requests.RequestException("Max retries exceeded") from last_error
def print_filtered_matches(content, keyword):
    """Print every entry of *content* containing *keyword*, case-insensitively."""
    needle = keyword.lower()
    matches = [entry for entry in content if needle in entry.lower()]
    print(Fore.CYAN + f"\n{'='*60}" + Style.RESET_ALL)
    print(Fore.CYAN + f"Filtered Results ({len(matches)} matches for '{keyword}'):" + Style.RESET_ALL)
    print(Fore.CYAN + f"{'='*60}" + Style.RESET_ALL)
    for index, entry in enumerate(matches, 1):
        print(f"{index}. {entry[:200]}...")
def export_content(content: List[str], export_format: str, output_name: Optional[str]):
    """Export scraped content to a timestamped json/csv/txt file.

    Args:
        content: scraped text snippets / URLs to persist.
        export_format: one of 'json', 'csv', 'txt'; anything else is a no-op.
        output_name: base filename; defaults to 'mined_content'.
    """
    name = output_name or "mined_content"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = None
    if export_format == 'json':
        filename = f"{name}_{timestamp}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump({"data": content, "timestamp": timestamp, "count": len(content)}, f, indent=2)
    elif export_format == 'csv':
        filename = f"{name}_{timestamp}.csv"
        with open(filename, "w", encoding="utf-8", newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["Index", "Content", "Length"])
            for idx, row in enumerate(content, 1):
                writer.writerow([idx, row, len(row)])
    elif export_format == 'txt':
        filename = f"{name}_{timestamp}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"WebMiner Export - {timestamp}\n")
            f.write(f"{'='*60}\n\n")
            f.write("\n\n".join([f"[{i}] {item}" for i, item in enumerate(content, 1)]))
    if filename:
        # Bug fix: report the actual output path (was a literal placeholder).
        print(Fore.BLUE + f"[+] Content saved to {filename}" + Style.RESET_ALL)
def export_summary(url: str, title: str, description: str, keywords: str, counts: Dict[str, int], output_name: Optional[str]):
    """Write a JSON summary of a scraped page (title, meta tags, element counts).

    The file is named <output_name or 'mined_content'>_summary_<timestamp>.json.
    """
    summary = {
        "url": url,
        "title": title,
        "meta_description": description,
        "meta_keywords": keywords,
        "element_counts": counts,
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # migrate to datetime.now(timezone.utc) once the extra "+00:00"
        # suffix in the stored timestamp is confirmed acceptable.
        "timestamp": datetime.utcnow().isoformat(),
        "mined_by": "WebMiner v1.0"
    }
    name = output_name or "mined_content"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{name}_summary_{timestamp}.json"
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    # Bug fix: report the actual output path (was a literal placeholder).
    print(Fore.BLUE + f"[+] Summary saved to {filename}" + Style.RESET_ALL)
def download_images(images, base_url: str):
    """Download every <img> tag in *images* into a fresh timestamped directory.

    Args:
        images: list of BeautifulSoup <img> tags (their 'src' attribute is read).
        base_url: page URL used to resolve relative image sources.
    """
    if not images:
        print(Fore.YELLOW + "[!] No images found to download." + Style.RESET_ALL)
        return
    # Local import keeps the module-level import block unchanged.
    from urllib.parse import urljoin
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    img_dir = f"images_{timestamp}"
    os.makedirs(img_dir, exist_ok=True)
    successful = 0
    for img in tqdm(images, desc="Downloading images"):
        src = img.get('src')
        if not src:
            continue
        # Bug fix: urljoin resolves protocol-relative (//), absolute and
        # path-relative sources correctly; the old manual concatenation
        # broke path-relative links on nested pages.
        img_url = urljoin(base_url, src)
        try:
            resp = requests.get(img_url, timeout=10)
            # Bug fix: skip error responses instead of saving 404/500 bodies
            # as image files.
            resp.raise_for_status()
            filename = os.path.basename(src.split('?')[0]) or f"image_{successful}.jpg"
            filepath = os.path.join(img_dir, filename)
            with open(filepath, 'wb') as f:
                f.write(resp.content)
            successful += 1
        except Exception:
            continue  # best-effort: one bad image must not abort the batch
    print(Fore.GREEN + f"[+] Downloaded {successful}/{len(images)} images to {img_dir}/" + Style.RESET_ALL)
def scrape_url(url, **kwargs):
    """Scrape a single page: print title/meta, paragraphs, headings, links and
    images, then optionally filter, download images, and export results.

    Recognized kwargs:
        format: 'json' | 'csv' | 'txt' export format (prompted if absent).
        download_images: bool, download all images without prompting.
        output: base filename for exports.
        filter: keyword to filter the collected content.
        headless: bool, suppress all interactive prompts.
    """
    HISTORY.append({'type':'scrape', 'url':url, 'time':datetime.now().isoformat()})
    if not is_valid_url(url):
        print(Fore.RED + "[!] Invalid URL format. Use format: https://example.com" + Style.RESET_ALL)
        return
    print(Fore.CYAN + f"\n[*] Scraping: {url}" + Style.RESET_ALL)
    try:
        response = retry_request(url)
    except Exception as e:
        print(Fore.RED + f"[!] Error fetching URL: {e}" + Style.RESET_ALL)
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    content = []
    # Extract title and meta information.
    # Bug fix: a <title> with no text (or with child elements) has
    # .string == None, which made .strip() raise AttributeError.
    title = soup.title.string.strip() if soup.title and soup.title.string else "(No Title Found)"
    meta_desc = soup.find("meta", attrs={"name":"description"})
    meta_keywords = soup.find("meta", attrs={"name":"keywords"})
    desc = meta_desc["content"].strip() if meta_desc and meta_desc.get("content") else "(No Meta Description)"
    keywords = meta_keywords["content"].strip() if meta_keywords and meta_keywords.get("content") else "(No Meta Keywords)"
    # Display page info
    print(Fore.MAGENTA + f"\n{'='*60}" + Style.RESET_ALL)
    print(Fore.MAGENTA + f"Page Title: {title}" + Style.RESET_ALL)
    print(Fore.MAGENTA + f"Meta Description: {desc[:100]}..." + Style.RESET_ALL)
    print(Fore.MAGENTA + f"{'='*60}" + Style.RESET_ALL)
    # Extract all elements
    paragraphs = soup.find_all('p')
    links = soup.find_all('a')
    headings = soup.find_all(['h1','h2','h3','h4','h5','h6'])
    images = soup.find_all('img')
    # Extract text from paragraphs (drop empty ones)
    text_data = [p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]
    content.extend(text_data)
    # Display text content (limited to the first 5 paragraphs)
    if text_data:
        print(Fore.GREEN + f"\n--- Page Text (showing first 5) ---" + Style.RESET_ALL)
        for i, t in enumerate(text_data[:5], 1):
            print(f"{i}. {t[:150]}...")
        if len(text_data) > 5:
            print(Fore.YELLOW + f"... and {len(text_data)-5} more paragraphs" + Style.RESET_ALL)
    # Extract and display headings (first 10)
    if headings:
        print(Fore.GREEN + f"\n--- Headings ({len(headings)} found) ---" + Style.RESET_ALL)
        for i, h in enumerate(headings[:10], 1):
            h_text = h.get_text(strip=True)
            if h_text:
                print(f"{h.name.upper()}: {h_text}")
                content.append(f"{h.name}: {h_text}")
    # Extract links (absolute http(s) links only, first 10 displayed)
    if links:
        print(Fore.GREEN + f"\n--- Links (showing first 10 of {len(links)}) ---" + Style.RESET_ALL)
        for link in links[:10]:
            href = link.get('href')
            if href and href.startswith('http'):
                print(f" → {href}")
                content.append(href)
    # Image sources (first 5 displayed)
    if images:
        print(Fore.GREEN + f"\n--- Images ({len(images)} found) ---" + Style.RESET_ALL)
        for i, img in enumerate(images[:5], 1):
            src = img.get('src')
            alt = img.get('alt', 'No alt text')
            if src:
                print(f"{i}. {src[:80]}... (alt: {alt[:30]})")
                content.append(src)
    # Summary statistics
    counts = {
        'paragraphs': len(paragraphs),
        'headings': len(headings),
        'links': len(links),
        'images': len(images)
    }
    print(Fore.CYAN + f"\n{'='*60}" + Style.RESET_ALL)
    print(Fore.CYAN + "Summary Statistics:" + Style.RESET_ALL)
    print(Fore.CYAN + f" • Paragraphs: {counts['paragraphs']}" + Style.RESET_ALL)
    print(Fore.CYAN + f" • Headings: {counts['headings']}" + Style.RESET_ALL)
    print(Fore.CYAN + f" • Links: {counts['links']}" + Style.RESET_ALL)
    print(Fore.CYAN + f" • Images: {counts['images']}" + Style.RESET_ALL)
    print(Fore.CYAN + f"{'='*60}\n" + Style.RESET_ALL)
    # Always write the JSON page summary
    export_summary(url, title, desc, keywords, counts, kwargs.get('output'))
    # Download images when requested by flag, or by prompt in interactive mode
    if kwargs.get('download_images', False) or (not kwargs.get('headless', False) and input("\nDownload all images? (y/n): ").strip().lower()=='y'):
        download_images(images, url)
    # Filter content by keyword (flag takes precedence over the prompt)
    if 'filter' in kwargs and kwargs['filter']:
        print_filtered_matches(content, kwargs['filter'])
    elif not kwargs.get('headless', False):
        keyword_search = input("\nEnter keyword to filter content (or press Enter to skip): ").strip()
        if keyword_search:
            print_filtered_matches(content, keyword_search)
    # Export collected content (headless mode defaults to skipping)
    if content:
        fmt = kwargs.get('format') or ("skip" if kwargs.get('headless', False) else input("\nSave content? (json/csv/txt/skip): ").strip().lower())
        if fmt in ['json','csv','txt']:
            export_content(content, fmt, kwargs.get('output'))
def search_term(term):
    """Run a Google search for *term* and print up to 20 result URLs.

    Offers to save the results to a timestamped text file. Prints an
    install hint and returns early when the googlesearch module is missing.
    """
    if not GOOGLESEARCH_AVAILABLE or search is None:
        print(Fore.RED + "[!] Google search is unavailable." + Style.RESET_ALL)
        print(Fore.YELLOW + "[!] Install with: pip install googlesearch-python" + Style.RESET_ALL)
        print(Fore.YELLOW + "[!] If already installed, try: pip uninstall googlesearch-python && pip install googlesearch-python" + Style.RESET_ALL)
        return
    print(Fore.CYAN + f"\n{'='*60}" + Style.RESET_ALL)
    print(Fore.CYAN + f"Google Search Results for: '{term}'" + Style.RESET_ALL)
    print(Fore.CYAN + f"{'='*60}\n" + Style.RESET_ALL)
    try:
        results = list(search(term, num_results=20))
        if not results:
            print(Fore.YELLOW + "[i] No results found." + Style.RESET_ALL)
            return
        for i, result in enumerate(results, start=1):
            print(f"{i:2}. {result}")
        print(Fore.CYAN + f"\n{'='*60}" + Style.RESET_ALL)
        save = input("\nSave results to file? (y/n): ").strip().lower()
        if save == 'y':
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"search_results_{timestamp}.txt"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(f"Google Search: {term}\n")
                f.write(f"Timestamp: {timestamp}\n")
                f.write(f"{'='*60}\n\n")
                f.write("\n".join([f"{i}. {r}" for i, r in enumerate(results, 1)]))
            # Bug fix: report the actual output path (was a literal placeholder).
            print(Fore.BLUE + f"[+] Results saved to {filename}" + Style.RESET_ALL)
    except Exception as e:
        print(Fore.RED + f"[!] Error during search: {e}" + Style.RESET_ALL)
def show_history():
    """Print every scrape/search recorded in HISTORY during this session."""
    if not HISTORY:
        print(Fore.YELLOW + "\n[i] No history yet." + Style.RESET_ALL)
        return
    print(Fore.YELLOW + f"\n{'='*60}" + Style.RESET_ALL)
    print(Fore.YELLOW + "Command History" + Style.RESET_ALL)
    print(Fore.YELLOW + f"{'='*60}" + Style.RESET_ALL)
    for index, record in enumerate(HISTORY, 1):
        label = record['type'].capitalize()
        target = record.get('url', '') or record.get('term', '')
        raw_time = record['time']
        # ISO timestamps contain 'T'; show just the HH:MM:SS portion
        stamp = raw_time.split('T')[1].split('.')[0] if 'T' in raw_time else raw_time
        print(f"{index:2}. [{label}] {target} @ {stamp}")
    print(Fore.YELLOW + f"{'='*60}\n" + Style.RESET_ALL)
def show_help():
    """Display help information"""
    # The f-string body is intentionally flush-left so the rendered help
    # text carries no leading indentation in the terminal.
    help_text = f"""
{Fore.CYAN}{'='*60}
WebMiner v1.0 - Help & Usage Guide
{'='*60}{Style.RESET_ALL}
{Fore.GREEN}INTERACTIVE MENU OPTIONS:{Style.RESET_ALL}
[1] Scrape URL - Extract content from any webpage
[2] Google Search - Search Google and get top results
[3] Show History - View your command history
[h] Help - Show this help message
[c] Clear Screen - Clear terminal and show banner
[x] Exit - Exit WebMiner
{Fore.GREEN}COMMAND-LINE ARGUMENTS:{Style.RESET_ALL}
--url URL Scrape specified URL
--search TERM Perform Google search
--format {{json,csv,txt}} Export format
--download-images Download all images from page
--output NAME Output filename (without extension)
--filter KEYWORD Filter content by keyword
--headless Non-interactive mode
--log PATH Save terminal output to file
{Fore.GREEN}EXAMPLES:{Style.RESET_ALL}
python WebMiner.py --url https://example.com --format json
python WebMiner.py --search "python tutorials"
python WebMiner.py --url https://site.com --download-images --output mydata
{Fore.GREEN}DEPENDENCIES:{Style.RESET_ALL}
pip install requests beautifulsoup4 colorama tqdm googlesearch-python
{Fore.CYAN}{'='*60}{Style.RESET_ALL}
"""
    print(help_text)
def main_menu():
    """Run the interactive menu loop until the user chooses to exit."""
    banner()
    print(Fore.GREEN + "[*] Type 'h' for help and usage guide" + Style.RESET_ALL)
    running = True
    while running:
        print(f"\n{Fore.YELLOW}[1]{Style.RESET_ALL} Scrape URL {Fore.YELLOW}[2]{Style.RESET_ALL} Google Search {Fore.YELLOW}[3]{Style.RESET_ALL} Show History")
        print(f"{Fore.YELLOW}[h]{Style.RESET_ALL} Help {Fore.YELLOW}[c]{Style.RESET_ALL} Clear Screen {Fore.YELLOW}[x]{Style.RESET_ALL} Exit")
        selection = input(f"\n{Fore.CYAN}[?]{Style.RESET_ALL} ").strip().lower()
        if selection == '1':
            target = input(f"{Fore.GREEN}Enter URL to scrape: {Style.RESET_ALL}").strip()
            if target:
                scrape_url(target)
        elif selection == '2':
            query = input(f"{Fore.GREEN}Enter search term: {Style.RESET_ALL}").strip()
            if query:
                # Searches are logged here; scrapes log themselves inside scrape_url.
                HISTORY.append({'type':'search', 'term':query, 'time':datetime.now().isoformat()})
                search_term(query)
        elif selection == '3':
            show_history()
        elif selection == 'h':
            show_help()
        elif selection == 'c':
            clear_screen()
            banner()
        elif selection == 'x':
            print(Fore.MAGENTA + "\n[x] Exiting WebMiner... Goodbye!" + Style.RESET_ALL)
            running = False
        else:
            print(Fore.RED + "[!] Invalid option. Type 'h' for help." + Style.RESET_ALL)
if __name__ == '__main__':
    # CLI entry point: direct scrape/search via flags, otherwise fall
    # through to the interactive menu.
    parser = argparse.ArgumentParser(
        description='WebMiner v1.0 - Advanced Web Scraper and Search Tool',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python WebMiner.py --url https://example.com --format json
python WebMiner.py --search "web scraping" --output results
python WebMiner.py --url https://site.com --download-images
"""
    )
    parser.add_argument('--url', help='Scrape specified URL')
    parser.add_argument('--search', help='Google Search for a term')
    parser.add_argument('--format', choices=['json','csv','txt'], help='Export format')
    parser.add_argument('--download-images', action='store_true', help='Download all images from page')
    parser.add_argument('--output', help='Output file name base (without extension)')
    parser.add_argument('--filter', help='Keyword to filter content')
    parser.add_argument('--headless', action='store_true', help='Non-interactive mode')
    parser.add_argument('--log', help='Path for terminal log output')
    args = parser.parse_args()
    if args.log:
        # Redirect all terminal output (stdout and stderr) into the log file.
        # NOTE(review): the handle is never explicitly closed — it relies on
        # interpreter shutdown to flush; consider a context manager. TODO confirm.
        sys.stdout = open(args.log, 'w', encoding='utf-8')
        sys.stderr = sys.stdout
    if args.url:
        scrape_url(args.url, format=args.format, download_images=args.download_images,
        output=args.output, filter=args.filter, headless=args.headless)
        sys.exit()
    elif args.search:
        search_term(args.search)
        sys.exit()
    else:
        main_menu()