aetherwiki/dot.py at main · a11ce/aetherwiki

99 lines (84 loc) · 2.82 KB
#!/usr/bin/env python3
"""
NOTE: This file was generated by GPT-4o.

Generate a DOT graph of HTML file link structure.

Usage:
    dot.py [-d DIRECTORY] [-o OUTPUT]

Requirements:
    pip install beautifulsoup4 networkx pydot
"""
# Prints Graphviz graph/node attribute statements to stdout as soon as the
# module is loaded, before any argument parsing happens.
# NOTE(review): this text goes to stdout, not into the DOT file that main()
# writes to --output -- confirm that is the intended destination.
print("graph [ overlap=false ];\nnode [shape=record,height=1];")
import argparse
import os
from urllib.parse import unquote, urlparse

import networkx as nx
from bs4 import BeautifulSoup
def find_html_files(directory):
    """Collect every ``.html``/``.htm`` file found under ``directory``.

    The tree is walked recursively with ``os.walk`` and the extension check
    is case-insensitive. Directories and files are visited in sorted order
    so the returned mapping (and anything built from it) is reproducible
    from one run to the next.

    Args:
        directory: Root directory to scan.

    Returns:
        A dict mapping each file's path relative to ``directory`` to a path
        that can be opened directly (the walked directory joined with the
        file name; it is absolute only when ``directory`` is). The dict is
        empty when no matching file is found.
    """
    html_files = {}
    for root, dirs, files in os.walk(directory):
        # Sort in place: os.walk descends into ``dirs`` in the order it is
        # left in here, so this fixes the traversal order.
        dirs.sort()
        for name in sorted(files):
            if name.lower().endswith(('.html', '.htm')):
                full_path = os.path.join(root, name)
                html_files[os.path.relpath(full_path, directory)] = full_path
    return html_files
def extract_links(file_path):
    """Extract links to other ``.html``/``.htm`` pages from an HTML file.

    Every ``<a>`` element with an ``href`` is inspected. Links that carry a
    URL scheme or a network location (``https://...``, ``mailto:...``,
    ``//host/...``) are treated as external and skipped. Query strings and
    fragments are dropped, percent-escapes in the path are decoded so that
    ``My%20Page.html`` matches a file named ``My Page.html`` on disk, and
    the path is normalized with ``os.path.normpath``.

    Args:
        file_path: Path of the HTML file to parse. It is read as UTF-8;
            undecodable bytes are replaced rather than aborting the scan.

    Returns:
        A set of normalized link paths as written in the document; they
        are not resolved against the location of ``file_path``.
    """
    # errors='replace' keeps one badly-encoded page from stopping the run.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        soup = BeautifulSoup(f, 'html.parser')

    links = set()
    for a in soup.find_all('a', href=True):
        parsed = urlparse(a['href'])
        # Skip external links. Fragment-only links have an empty path and
        # are rejected by the extension check below.
        if parsed.scheme or parsed.netloc:
            continue
        # hrefs are URL-encoded while file names on disk are not.
        path = unquote(parsed.path)
        if path.lower().endswith(('.html', '.htm')):
            links.add(os.path.normpath(path))
    return links
def build_graph(html_files):
    """Build a directed link graph over a collection of HTML files.

    Each key of ``html_files`` becomes a node. For every file, the links
    reported by ``extract_links`` are resolved against the directory of the
    linking file (the way a relative ``href`` is resolved) and an edge is
    added when the result is itself a key of ``html_files``. Links that
    resolve to anything else are ignored.

    Args:
        html_files: Mapping of path relative to the scanned root -> path
            that can be opened, as returned by ``find_html_files``.

    Returns:
        A ``networkx.DiGraph`` whose nodes are the keys of ``html_files``
        and whose edges point from a linking page to the linked page.
    """
    G = nx.DiGraph()
    # Add every file up front so pages without any links still appear.
    G.add_nodes_from(html_files)

    for node, full_path in html_files.items():
        # Keys are root-relative, so the key's directory is the base
        # against which this page's relative links must be resolved.
        base_dir = os.path.dirname(node)
        # Sorted so edges are inserted in a reproducible order.
        for link in sorted(extract_links(full_path)):
            tgt = os.path.normpath(os.path.join(base_dir, link))
            if tgt in html_files:
                G.add_edge(node, tgt)
    return G
def main():
    """Command-line entry point: scan a directory and write its link graph.

    Parses ``-d/--directory`` and ``-o/--output``, collects the HTML files
    under the directory, and writes the link graph to the output path in
    DOT format. When the scan finds no HTML files, a message is printed and
    nothing is written.
    """
    cli = argparse.ArgumentParser(description='Generate DOT graph of HTML links.')
    cli.add_argument('-d', '--directory', default='.',
                     help='Directory to scan for HTML files.')
    cli.add_argument('-o', '--output', default='graph.dot',
                     help='Path to output DOT file.')
    args = cli.parse_args()

    pages = find_html_files(args.directory)
    if pages:
        link_graph = build_graph(pages)
        # networkx hands DOT serialization off to pydot.
        nx.drawing.nx_pydot.write_dot(link_graph, args.output)
        print(f"DOT graph written to {args.output}")
    else:
        print(f"No HTML files found in {args.directory}")
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script.
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

dot.py

Latest commit

History

dot.py

File metadata and controls