#!/usr/bin/env python3
# crawl_urls.py — parse Apache directory index

import urllib.request
import urllib.parse
import re
import time

# Origin of the site being crawled; directory paths are appended to it.
BASE = "https://vmware.digiboy.ir"

# File suffixes that mark a link as a download worth recording.
# Kept lower-case: link names are lower-cased before being compared.
EXTENSIONS = (
    ".iso", ".ova", ".zip", ".bundle", ".exe", ".dmg",
    ".tar", ".pak", ".spl", ".img", ".msi", ".rar", ".7z",
)

# Crawl bookkeeping shared by crawl() and the driver loop.
visited = set()   # directory paths that have already been requested
file_urls = []    # absolute URLs of matching files, in discovery order

# Seed directories, already percent-encoded. crawl() appends the
# sub-directories it discovers, so this list doubles as the work queue.
# NOTE(review): "Worksation" is spelled exactly as in the original list;
# presumably it mirrors the directory name on the server -- confirm.
DIRS = [
    "/",
    "/6.X/", "/7.X/", "/8.X/", "/9.X/",
    "/9.X/vCF/",
    "/Omnissa%20Horizon/",
    "/VMware%20Remote%20Console/",
    "/VMware%20Worksation/",
    "/VMwareTools/",
]

# Captures the value of every double-quoted href attribute in a page.
_HREF_RE = re.compile(r'href="([^"]+)"')
# RFC 3986 scheme prefix ("http:", "ftp:", "mailto:", ...): an absolute URL.
_SCHEME_RE = re.compile(r'[A-Za-z][A-Za-z0-9+.\-]*:')


def _fetch_index(url):
    """Download *url* and return its body as text.

    The body is decoded as UTF-8 and undecodable bytes are dropped.
    Any exception raised by ``urlopen`` or ``read`` propagates to the caller.
    """
    req = urllib.request.Request(url, headers={
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode('utf-8', errors='ignore')


def _is_child_link(href):
    """Return True when *href* is a relative link to an entry of the listing.

    Query-only links (``?``), root-relative links (``/``), fragments (``#``),
    parent links (``..``) and absolute URLs carrying a scheme are rejected.
    A plain name that merely starts with "http" (``httpd.zip``) is accepted.
    """
    if href.startswith(('?', '/', '#', '..')):
        return False
    return _SCHEME_RE.match(href) is None


def crawl(path):
    """Fetch one directory listing and record what it links to.

    Args:
        path: Directory path relative to ``BASE``, percent-encoded, with a
            leading and trailing slash (e.g. ``"/7.X/"``).

    Side effects:
        * adds *path* to ``visited`` (a path is fetched at most once);
        * appends newly found sub-directories to ``DIRS``;
        * appends the absolute URL of every link whose decoded name ends
          with one of ``EXTENSIONS`` to ``file_urls``;
        * prints progress and sleeps 0.5 s after the request.

    Fetch failures are reported on stdout and otherwise ignored so that one
    bad directory does not abort the whole crawl.
    """
    # Local import: keeps the block self-contained without touching the
    # module-level import list.
    from html import unescape

    if path in visited:
        return
    visited.add(path)

    url = BASE + path
    print(f"Crawling: {url}")
    try:
        page = _fetch_index(url)
    except Exception as e:
        # Best effort: report the failure and carry on with the other dirs.
        print(f"  [ERROR] {e}")
    else:
        seen = set()
        for raw_href in _HREF_RE.findall(page):
            # Attribute values are HTML-escaped in the markup ("&amp;" etc.).
            href = unescape(raw_href)
            # A listing may link the same entry twice; record it only once.
            if href in seen or not _is_child_link(href):
                continue
            seen.add(href)

            if href.endswith('/'):
                # Sub-directory: queue it for a later crawl() call.
                sub = path + href
                if sub not in visited:
                    DIRS.append(sub)
                continue

            decoded = urllib.parse.unquote(href)
            if decoded.lower().endswith(EXTENSIONS):
                file_urls.append(BASE + path + href)
                print(f"  + {decoded}")
    finally:
        # Be polite to the server after every request, failed ones included.
        time.sleep(0.5)

# Drain the work queue. crawl() appends newly discovered sub-directories to
# DIRS while this loop is running; iterating the list itself (not a copy)
# is what makes those additions get visited too.
for d in DIRS:
    crawl(d)

# Pin the encoding: the platform default is locale-dependent and could fail
# on a URL that contains non-ASCII characters.
with open("urls.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(file_urls))

print(f"\n{'='*40}")
print(f"Tổng file: {len(file_urls)}")
print("Đã lưu vào: urls.txt")
