import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse, urldefrag
from bs4 import BeautifulSoup
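
# Overview: a single-file, breadth-first crawler for one site. For each page it
# saves the visible text, downloads images, logs form structures, and collects
# font/color CSS rules; when the crawl ends it writes every URL it discovered.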

# Global sets to reduce duplication
seen_css_lines = set()
seen_forms = set()
unique_urls = set()  # To store all unique URLs encountered

# Ensure the output directories exist and reset the config files
def prepare_directories():
    for directory in ("images", "content", "config"):
        os.makedirs(directory, exist_ok=True)
    # Truncate the config files so each run starts from a clean slate
    for name in ("image-map.txt", "forms.txt", "design-over.css", "urls.txt"):
        open(os.path.join("config", name), "w").close()
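
# Resulting layout (filled in by the functions below):
#   images/   downloaded image files
#   content/  one <page>-content.txt file per crawled page
#   config/   image-map.txt, forms.txt, design-over.css, urls.txt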

# Check if the URL is on the same domain
def same_domain(base, target):
    return urlparse(base).netloc == urlparse(target).netloc
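
# Example: same_domain("https://example.com/a", "https://example.com/b") is True,
# but a subdomain such as "https://blog.example.com/" counts as a different site
# because the comparison uses the full netloc (host plus port).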

# Save page inner text to file (in content directory)
def process_text(soup, page_url):
    text = soup.get_text(separator="\n", strip=True)
    parsed = urlparse(page_url)
    # Determine filename based on path
    if parsed.path in ("", "/"):
        filename = "home-content.txt"
    else:
        safe_path = parsed.path.strip("/").replace("/", "-")
        if not safe_path:
            safe_path = "page"
        filename = f"{safe_path}-content.txt"
    filepath = os.path.join("content", filename)
    try:
        with open(filepath, "w", encoding="utf-8") as ftext:
            ftext.write(text)
    except Exception as e:
        print(f"Error writing text for {page_url}: {e}")

# Process images: download and log mapping (in config directory)
def process_images(soup, page_url):
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        full_url = urljoin(page_url, src)
        try:
            response = requests.get(full_url, stream=True, timeout=10)
            if response.status_code == 200:
                parsed = urlparse(full_url)
                file_name = os.path.basename(parsed.path)
                if not file_name:
                    file_name = f"image_{int(time.time()*1000)}.jpg"
                image_path = os.path.join("images", file_name)
                with open(image_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                with open(os.path.join("config", "image-map.txt"), "a", encoding="utf-8") as fmap:
                    fmap.write(f"{page_url} -> {full_url} saved as {file_name}\n")
        except Exception as e:
            print(f"Error downloading image {full_url}: {e}")

# Process forms: log form details, avoiding duplicates
def process_forms(soup, page_url):
    forms = soup.find_all("form")
    if forms:
        with open(os.path.join("config", "forms.txt"), "a", encoding="utf-8") as fform:
            for form in forms:
                action = form.get("action")
                method = form.get("method", "GET").upper()
                # Create a signature string for deduplication
                inputs = []
                for tag in form.find_all(["input", "textarea", "select"]):
                    tag_name = tag.name
                    tag_type = tag.get("type", "")
                    tag_field = tag.get("name", "N/A")
                    inputs.append(f"{tag_name}:{tag_field}:{tag_type}")
                signature = f"{action}|{method}|{'|'.join(sorted(inputs))}"
                if signature in seen_forms:
                    continue
                seen_forms.add(signature)
                fform.write(f"Page: {page_url}\n")
                fform.write(f"Form: action={action}, method={method}\n")
                for item in inputs:
                    fform.write(f"    {item}\n")
                fform.write("\n")
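
# Each forms.txt entry looks like:
#   Page: <page_url>
#   Form: action=<action>, method=<METHOD>
#       <tag>:<name>:<type>   (one line per input, textarea, or select)
# A form with the same action, method, and fields as one already seen is skipped.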

# Process CSS: extract inline and external CSS, deduplicate and store only font-face and color information
def process_css(soup, page_url):
    css_content = ""
    # Inline <style> blocks
    for style in soup.find_all("style"):
        css_content += style.get_text() + "\n"
    # External CSS files
    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get("href")
        if href:
            full_css_url = urljoin(page_url, href)
            try:
                r = requests.get(full_css_url, timeout=10)
                if r.status_code == 200:
                    css_content += r.text + "\n"
            except Exception as e:
                print(f"Error fetching CSS {full_css_url}: {e}")
    # Filter to keep only @font-face, font-family, or color lines
    new_lines = []
    for line in css_content.splitlines():
        if re.search(r'(@font-face|font-family|color\s*:)', line, re.IGNORECASE):
            clean_line = line.strip()
            if clean_line and clean_line not in seen_css_lines:
                seen_css_lines.add(clean_line)
                new_lines.append(clean_line)
    if new_lines:
        with open(os.path.join("config", "design-over.css"), "a", encoding="utf-8") as fcss:
            fcss.write("\n".join(new_lines) + "\n")
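
# design-over.css ends up as a flat, de-duplicated list of the lines that mention
# @font-face, font-family, or a color property; it is a design reference rather
# than a loadable stylesheet, since surrounding selectors and braces may be missing.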

# Crawl the site (simple breadth-first crawler within same domain)
def crawl_site(start_url, max_pages=50):
    visited = set()
    queue = [start_url]
    unique_urls.add(start_url)
    while queue and len(visited) < max_pages:
        current_url = queue.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)
        try:
            print(f"Processing: {current_url}")
            response = requests.get(current_url, timeout=10)
            if response.status_code != 200:
                continue
            # Only parse HTML responses; skip PDFs, images, and other binary files
            if "text/html" not in response.headers.get("Content-Type", ""):
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # Process the page content, images, forms, and CSS
            process_text(soup, current_url)
            process_images(soup, current_url)
            process_forms(soup, current_url)
            process_css(soup, current_url)
            # Queue new links within the same domain; strip #fragments so the
            # same page is not queued once per anchor
            for link in soup.find_all("a", href=True):
                next_url, _ = urldefrag(urljoin(current_url, link["href"]))
                if same_domain(start_url, next_url):
                    # Add to unique URLs set
                    unique_urls.add(next_url)
                    if next_url not in visited:
                        queue.append(next_url)
        except Exception as e:
            print(f"Error processing page {current_url}: {e}")

# After crawling, write the unique URLs to config/urls.txt
def write_unique_urls():
    try:
        with open(os.path.join("config", "urls.txt"), "w", encoding="utf-8") as f:
            for url in sorted(unique_urls):
                f.write(url + "\n")
    except Exception as e:
        print(f"Error writing unique URLs: {e}")

if __name__ == "__main__":
    prepare_directories()
    start_url = input("Enter the website URL to crawl: ").strip()
    if not start_url:
        print("No URL entered. Exiting.")
        raise SystemExit(1)
    # Assume HTTPS when the scheme was omitted (e.g. a bare "example.com")
    if not start_url.startswith(("http://", "https://")):
        start_url = "https://" + start_url
    crawl_site(start_url)
    write_unique_urls()
    print("Crawling complete. Check the 'content' and 'config' directories and the 'images' folder for output.")