import os
import re
import sys
import time
import json
import requests
import socket
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import random

# Optional imports for headless browser mode
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Selenium not installed. Headless browser mode will not be available.")
    print("To enable, install with: pip install selenium webdriver-manager")

class WebsiteAnalyzer:
    def __init__(self, use_headless_browser=False):
        self.output_dir = "site_analysis"
        self.seen_urls = set()
        self.last_fetched_url = None  # Most recently fetched page, used as the Referer
        self.prepare_directories()
        
        # Use headless browser if requested and available
        self.use_headless_browser = use_headless_browser and SELENIUM_AVAILABLE
        self.driver = None
        
        if self.use_headless_browser:
            print("Initializing headless Chrome browser...")
            try:
                options = Options()
                options.add_argument("--headless")
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-dev-shm-usage")
                options.add_argument("--window-size=1920,1080")
                options.add_argument("--disable-extensions")
                options.add_argument("--disable-gpu")
                
                # Add user agent
                options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
                
                # Initialize the Chrome driver
                self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
                print("Headless Chrome browser ready.")
            except Exception as e:
                print(f"Failed to initialize headless browser: {e}")
                self.use_headless_browser = False
        
        # Use Chrome-like request headers so requests resemble a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
            'Referer': 'https://www.google.com/'
        }
        # Set up a persistent session to maintain cookies
        self.session = requests.Session()
        self.session.headers.update(self.headers)
        
    def __del__(self):
        # Use getattr in case __init__ failed before self.driver was assigned
        if getattr(self, "driver", None):
            self.driver.quit()

    def prepare_directories(self):
        """Create necessary directories for output"""
        os.makedirs(self.output_dir, exist_ok=True)

        subdirs = ["images", "components", "content", "pages"]
        for subdir in subdirs:
            os.makedirs(os.path.join(self.output_dir, subdir), exist_ok=True)
    
    def download_image(self, img_url, page_url):
        """Download an image and return the saved file name, or None on failure"""
        try:
            # Skip data: URLs and other non-HTTP URLs
            if img_url.startswith('data:'):
                print("Skipping data: URL image")
                return None
                
            # Handle relative URLs properly
            if not img_url.startswith(('http://', 'https://')):
                img_url = urljoin(page_url, img_url)
                
            print(f"Downloading image from: {img_url}")
            
            # Use the page the image appeared on as the Referer
            custom_headers = self.headers.copy()
            custom_headers['Referer'] = page_url
            
            response = self.session.get(img_url, stream=True, timeout=15, headers=custom_headers)
            
            if response.status_code == 200:
                # Check if response is actually an image
                content_type = response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    print(f"Skipping non-image content type: {content_type}")
                    return None
                
                # Generate filename from URL
                parsed = urlparse(img_url)
                file_name = os.path.basename(parsed.path)
                
                # Check for valid filename
                if not file_name or '.' not in file_name:
                    # Try to determine extension from content type
                    extension = 'jpg'  # Default
                    if 'image/png' in content_type:
                        extension = 'png'
                    elif 'image/gif' in content_type:
                        extension = 'gif'
                    elif 'image/svg' in content_type:
                        extension = 'svg'
                    elif 'image/webp' in content_type:
                        extension = 'webp'
                        
                    file_name = f"image_{int(time.time()*1000)}.{extension}"
                
                # Make sure we have a unique filename
                base_name, ext = os.path.splitext(file_name)
                count = 0
                while os.path.exists(os.path.join(self.output_dir, "images", file_name)):
                    count += 1
                    file_name = f"{base_name}_{count}{ext}"
                
                # Save the image
                image_path = os.path.join(self.output_dir, "images", file_name)
                with open(image_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                        
                print(f"Successfully saved image to {image_path}")
                return file_name
            else:
                print(f"Failed to download image. Status code: {response.status_code}")
        except requests.exceptions.RequestException as e:
            print(f"Request error downloading image {img_url}: {e}")
        except Exception as e:
            print(f"Error downloading image {img_url}: {e}")
        return None

    def detect_component_type(self, element):
        """Detect if element is header, nav, section, footer, etc."""
        if element.name == "header":
            return "header"
        elif element.name == "nav" or (element.name and "nav" in element.get("class", [])):
            return "nav"
        elif element.name == "footer":
            return "footer"
        elif element.name == "section":
            return "section"
        elif element.name == "main":
            return "main"
        elif element.name == "article":
            return "article"
        elif element.name == "div" and element.get("id") in ["header", "nav", "navigation", "footer"]:
            return element.get("id")
        elif element.name == "div" and element.find(["h1", "h2", "h3"]):
            return "section"
        return None

    def analyze_component(self, element, component_type):
        """Analyze a component to extract structure and content"""
        component_data = {
            "type": component_type,
            "classes": element.get("class", []),
            "id": element.get("id", ""),
            "content": []
        }
        
        # Extract headings
        headings = element.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        if headings:
            component_data["headings"] = [{"level": h.name, "text": h.get_text(strip=True)} for h in headings]
        
        # Extract paragraphs
        paragraphs = element.find_all("p")
        if paragraphs:
            component_data["paragraphs"] = [p.get_text(strip=True) for p in paragraphs]
            
        # Extract links
        links = element.find_all("a")
        if links:
            component_data["links"] = [
                {"text": a.get_text(strip=True), "href": a.get("href", "")} 
                for a in links if a.get_text(strip=True)
            ]
            
        # Extract images
        images = element.find_all("img")
        if images:
            component_data["images"] = []
            for img in images:
                src = img.get("src")
                alt = img.get("alt", "")
                if src:
                    component_data["images"].append({
                        "src": src,
                        "alt": alt,
                        "downloaded_path": None  # Will be filled later
                    })
                    
        # Extract background images from inline styles
        bg_images = []
        for tag in element.find_all(style=True):
            style = tag.get("style", "")
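            # Capture the URL inside url(...), with or without quotes,
            # e.g. style="background-image: url('/img/hero.jpg')" -> "/img/hero.jpg"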
            bg_match = re.search(r'background-image:\s*url\([\'"]?(.*?)[\'"]?\)', style)
            if bg_match:
                bg_url = bg_match.group(1)
                bg_images.append({"src": bg_url, "element": tag.name, "downloaded_path": None})
        
        if bg_images:
            component_data["background_images"] = bg_images
            
        return component_data

    def get_page_content(self, url):
        """Get page content using either requests or headless browser"""
        if self.use_headless_browser and self.driver:
            try:
                print(f"Using headless browser to fetch: {url}")
                self.driver.get(url)
                # Wait for page to load
                time.sleep(3)
                self.last_fetched_url = url
                return self.driver.page_source
            except Exception as e:
                print(f"Error with headless browser: {e}")
                print("Falling back to regular requests...")
                
        # Update the Referer before each request to look more natural.
        # self.seen_urls is an unordered set, so track the previously
        # fetched page explicitly rather than indexing into the set.
        if self.last_fetched_url and self.last_fetched_url != url:
            self.session.headers.update({'Referer': self.last_fetched_url})

        # Add delay to mimic human browsing (0.5-2 seconds)
        time.sleep(0.5 + 1.5 * random.random())

        # Use regular requests
        response = self.session.get(url, timeout=15, allow_redirects=True)
        response.raise_for_status()
        self.last_fetched_url = url
        return response.text

    def analyze_page(self, url):
        """Analyze a single page and extract components"""
        normalized_url = self.normalize_url(url)
        
        if normalized_url in self.seen_urls:
            print(f"Skipping already analyzed URL: {url}")
            return None
            
        self.seen_urls.add(normalized_url)
        print(f"Analyzing page: {url}")
        
        try:
            print(f"Attempting to fetch {url}...")
            
            try:
                html_content = self.get_page_content(url)
                print(f"Successfully fetched {url}, parsing content...")
            except Exception as e:
                print(f"Failed to fetch {url}: {e}")
                return None
                
            soup = BeautifulSoup(html_content, "html.parser")
            
            # Save entire page inner text to content directory
            page_text = soup.get_text(separator="\n", strip=True)
            page_filename = self.get_page_filename(normalized_url).replace('.json', '.txt')
            text_path = os.path.join(self.output_dir, "content", page_filename)
            with open(text_path, "w", encoding="utf-8") as f:
                f.write(page_text)
            print(f"Saved page text to {text_path}")
            
            # Basic page info
            page_data = {
                "url": normalized_url,  # Store normalized URL in the data
                "original_url": url,    # Keep track of original URL
                "title": soup.title.get_text() if soup.title else "",
                "components": [],
                "links": [],
                "images": []  # Add a dedicated list for all images
            }
            
            # Extract all images from the page for direct downloading
            all_images = []
            for img in soup.find_all("img"):
                src = img.get("src")
                if src:
                    all_images.append({
                        "src": src,
                        "alt": img.get("alt", ""),
                        "downloaded_path": None
                    })
            
            # Add all found images to page data
            page_data["images"] = all_images
            
            # Extract main sections - focus on structure, not specific content
            main_components = []
            
            # Try to find major structural elements
            for element in soup.find_all(["header", "nav", "section", "footer", "main", "article", "div"]):
                component_type = self.detect_component_type(element)
                if component_type:
                    # Extract detailed component data using analyze_component
                    detailed_component = self.analyze_component(element, component_type)
                    main_components.append(detailed_component)
            
            # If no structure found, try to infer from positioning
            if not main_components:
                # Find the body or main container
                body = soup.body or soup
                
                # Extract major sections by position
                sections = self.extract_sections_by_position(body)
                main_components.extend(sections)
            
            page_data["components"] = main_components
            
            # Extract ALL links for further crawling - make sure we don't miss pages
            for a in soup.find_all("a", href=True):
                href = a["href"]
                # Skip fragment identifiers, javascript, and mailto links
                if href.startswith(('#', 'javascript:', 'mailto:')):
                    continue

                full_url = urljoin(url, href)

                # Only keep links to the same domain (ignoring www)
                if self.domains_match(full_url, url):
                    # Normalize the link (drop fragments, trailing slash, www).
                    # Use a distinct name so the page's own normalized_url isn't clobbered.
                    normalized_link = self.normalize_url(full_url)

                    # Add to the links list if not already there
                    if normalized_link not in page_data["links"]:
                        page_data["links"].append(normalized_link)
            
            # Process and download images
            self.process_images(page_data, url)
            
            # Save the page analysis
            json_filename = self.get_page_filename(normalized_url)
            with open(os.path.join(self.output_dir, "pages", json_filename), "w") as f:
                json.dump(page_data, f, indent=2)
            
            print(f"Successfully analyzed {url}")    
            return page_data
            
        except requests.exceptions.ConnectionError as e:
            print(f"Connection error for {url}: {e}")
            print("Please check if the URL is correct and the website is accessible.")
            return None
        except requests.exceptions.Timeout:
            print(f"Timeout error for {url}: The server took too long to respond.")
            return None
        except requests.exceptions.TooManyRedirects:
            print(f"Too many redirects for {url}: The request exceeded the maximum number of redirects.")
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request error for {url}: {e}")
            return None
        except Exception as e:
            print(f"Error analyzing page {url}: {e}")
            return None
            
    def extract_sections_by_position(self, body):
        """Extract sections by their position in the document"""
        sections = []

        # Work only with element children; text nodes would throw off the
        # position-based header/footer detection below
        children = [c for c in body.children if c.name]

        # If the body is just a single wrapper element, dig one level deeper
        while len(children) == 1:
            inner = [c for c in children[0].children if c.name]
            if not inner:
                break
            children = inner

        # Process each container element
        for i, child in enumerate(children):
                
            # Determine section type based on position
            if i == 0:
                section_type = "header"
            elif i == len(children) - 1:
                section_type = "footer"
            else:
                section_type = "section"
                
            section_data = {
                "type": section_type,
                "id": child.get("id", ""),
                "has_image": len(child.find_all("img")) > 0,
                "has_form": len(child.find_all("form")) > 0,
                "has_links": len(child.find_all("a")) > 0,
                "approximate_size": len(str(child))
            }
            sections.append(section_data)
            
        return sections

    def process_images(self, page_data, page_url):
        """Process and download all images in the page data"""
        # First, process the dedicated images list
        print(f"Processing {len(page_data.get('images', []))} images from page {page_url}")
        for img in page_data.get("images", []):
            if img["src"]:
                try:
                    full_url = urljoin(page_url, img["src"])
                    local_path = self.download_image(full_url, page_url)
                    if local_path:
                        img["downloaded_path"] = local_path
                        print(f"Downloaded image: {img['src']} -> {local_path}")
                    else:
                        print(f"Failed to download image: {img['src']}")
                except Exception as e:
                    print(f"Error processing image {img['src']}: {e}")
        
        # Then process images in components
        for component in page_data["components"]:
            # Process regular images
            if "images" in component:
                for img in component["images"]:
                    if img["src"]:
                        try:
                            full_url = urljoin(page_url, img["src"])
                            local_path = self.download_image(full_url, page_url)
                            if local_path:
                                img["downloaded_path"] = local_path
                                print(f"Downloaded component image: {img['src']} -> {local_path}")
                        except Exception as e:
                            print(f"Error processing component image {img['src']}: {e}")
            
            # Process background images
            if "background_images" in component:
                for bg_img in component["background_images"]:
                    if bg_img["src"]:
                        try:
                            full_url = urljoin(page_url, bg_img["src"])
                            local_path = self.download_image(full_url, page_url)
                            if local_path:
                                bg_img["downloaded_path"] = local_path
                                print(f"Downloaded background image: {bg_img['src']} -> {local_path}")
                        except Exception as e:
                            print(f"Error processing background image {bg_img['src']}: {e}")

    def get_page_filename(self, url):
        """Generate a filename for storing page data"""
        parsed = urlparse(url)
        path = parsed.path.strip("/")
        
        if not path:
            return "home.json"
        
        # Clean up the path for filename use
        safe_path = re.sub(r'[^\w\-]', '_', path)
        # Trim if too long
        if len(safe_path) > 100:
            safe_path = safe_path[:100]
        return f"{safe_path}.json"
    
    def generate_rebuild_context(self):
        """Generate a simplified context document for AI rebuilding with Tailwind"""
        all_pages = []
        pages_dir = os.path.join(self.output_dir, "pages")
        
        # Check if directory exists and has files
        if not os.path.exists(pages_dir) or not os.listdir(pages_dir):
            print(f"No pages found in {pages_dir}")
            return None
            
        # Load all page data
        for filename in os.listdir(pages_dir):
            if filename.endswith(".json"):
                try:
                    with open(os.path.join(pages_dir, filename), "r") as f:
                        all_pages.append(json.load(f))
                except Exception as e:
                    print(f"Error loading {filename}: {e}")
        
        if not all_pages:
            print("No pages data found to generate rebuild context")
            return None
        
        # Create a map of normalized URLs to page data
        normalized_url_map = {}
        for page in all_pages:
            normalized_url = self.normalize_url(page.get("url", ""))
            normalized_url_map[normalized_url] = page
            
        # Get site name from the home page or first page
        site_name = "Untitled Site"
        # Stored URLs are normalized (no trailing slash), so detect the home page
        # by an empty/root path or an index file rather than a trailing "/"
        home_page = next(
            (p for p in all_pages
             if urlparse(p["url"]).path in ("", "/") or "/index." in p["url"].lower()),
            None
        )
        if home_page and home_page["title"]:
            site_name = home_page["title"]
        elif all_pages and all_pages[0]["title"]:
            site_name = all_pages[0]["title"]
            
        # Get the base domain for the site
        base_domain = ""
        if all_pages:
            base_domain = self.normalize_domain(urlparse(all_pages[0]["url"]).netloc)
            
        # Create structure summary
        site_structure = {
            "site_name": site_name,
            "domain": base_domain,
            "num_pages": len(all_pages),
            "pages": []
        }
        
        # Track all unique URLs
        all_urls = set()
        analyzed_urls = set()
        
        # Extract common color schemes from page titles and URLs
        color_terms = ['blue', 'green', 'red', 'yellow', 'orange', 'purple', 'pink', 'black', 'white', 'gray', 'grey']
        site_colors = {}
        
        # Collect image statistics
        total_images = 0
        image_files = set()
        
        # Process each page
        for page in all_pages:
            url = page["url"]
            normalized_url = self.normalize_url(url)
            analyzed_urls.add(normalized_url)
            
            # Simplify URL for display
            display_url = urlparse(url).path
            if not display_url or display_url == "/":
                display_url = "Home Page"
            else:
                display_url = display_url.strip("/")
                
            # Count components by type
            component_count = {}
            for component in page["components"]:
                comp_type = component["type"]
                if comp_type in component_count:
                    component_count[comp_type] += 1
                else:
                    component_count[comp_type] = 1
            
            # Count images on the page
            page_images = 0
            # From dedicated images list
            for img in page.get("images", []):
                page_images += 1
                if img.get("downloaded_path"):
                    image_files.add(img["downloaded_path"])
                    
            # From components
            for component in page["components"]:
                if "images" in component:
                    for img in component["images"]:
                        page_images += 1
                        if img.get("downloaded_path"):
                            image_files.add(img["downloaded_path"])
                            
                if "background_images" in component:
                    for bg_img in component["background_images"]:
                        page_images += 1
                        if bg_img.get("downloaded_path"):
                            image_files.add(bg_img["downloaded_path"])
            
            total_images += page_images
            
            # Look for color terms in title and URL
            for color in color_terms:
                if color in page["title"].lower() or color in url.lower():
                    site_colors[color] = site_colors.get(color, 0) + 1
            
            page_summary = {
                "url": url,
                "display_url": display_url,
                "title": page["title"],
                "structure": component_count,
                "images_count": page_images
            }
            
            site_structure["pages"].append(page_summary)
            
            # Process links in the page
            for link in page.get("links", []):
                normalized_link = self.normalize_url(link)
                if normalized_link and not normalized_link.startswith(('javascript:', 'mailto:', 'tel:', 'data:')):
                    all_urls.add(normalized_link)
        
        # Create color scheme suggestion based on found colors
        color_scheme = []
        if site_colors:
            # Sort colors by frequency
            sorted_colors = sorted(site_colors.items(), key=lambda x: x[1], reverse=True)
            primary_color = sorted_colors[0][0]
            color_scheme.append(f"Primary color: {primary_color}")
            
            if len(sorted_colors) > 1:
                secondary_color = sorted_colors[1][0]
                color_scheme.append(f"Secondary color: {secondary_color}")
                
            if len(sorted_colors) > 2:
                accent_color = sorted_colors[2][0]
                color_scheme.append(f"Accent color: {accent_color}")
        
        # Create the rebuild context with enhanced information
        rebuild_context = {
            "site_structure": site_structure,
            "tailwind_rebuild_notes": {
                "general": "This site should be rebuilt using Tailwind CSS utility classes for responsive design",
                "components": {
                    "header": "Create a responsive header with Tailwind, using flex or grid for layout",
                    "nav": "Build navigation with Tailwind's utility classes for responsive design",
                    "section": "Use Tailwind's spacing, typography, and color utilities for content sections",
                    "footer": "Create a footer with Tailwind, consider using grid for complex layouts"
                },
                "color_scheme": color_scheme if color_scheme else ["Use neutral color scheme based on Tailwind defaults"],
                "responsive_design": "Ensure the site is fully responsive using Tailwind's breakpoint utilities",
                "accessibility": "Implement proper contrast ratios and ARIA attributes for accessibility"
            },
            "site_map": {
                "total_unique_urls": len(all_urls),
                "pages_analyzed": len(analyzed_urls),
                "percentage_coverage": 100 if not all_urls else round((len(analyzed_urls) / len(all_urls)) * 100, 2)
            },
            "assets": {
                "total_images_found": total_images,
                "unique_images_downloaded": len(image_files),
                "image_files": list(image_files)[:20] # List first 20 images to avoid making the file too large
            }
        }
        
        # Save rebuild context
        with open(os.path.join(self.output_dir, "rebuild_context.json"), "w") as f:
            json.dump(rebuild_context, f, indent=2)
            
        # Log any unanalyzed URLs for reference
        unanalyzed = all_urls - analyzed_urls
        if unanalyzed:
            print(f"\nNOTE: {len(unanalyzed)} URLs were found in links but not analyzed. This is likely due to:")
            print("1. URLs pointing to external domains")
            print("2. URLs that were filtered out (images, PDFs, etc.)")
            print("3. Links discovered in the final phase that weren't processed")
            
            # Only show the first few
            if len(unanalyzed) > 0:
                print("\nSample unanalyzed URLs:")
                for url in list(unanalyzed)[:3]:
                    print(f"- {url}")
            
        print(f"Rebuild context saved to {os.path.join(self.output_dir, 'rebuild_context.json')}")
        return rebuild_context

    def try_url_variants(self, url):
        """Try different URL variants (http/https) to see which one works"""
        parsed = urlparse(url)
        variants = []
        
        # Try both http and https
        if parsed.scheme == 'https':
            http_variant = 'http' + url[5:]
            variants = [url, http_variant]
        elif parsed.scheme == 'http':
            https_variant = 'https' + url[4:]
            variants = [url, https_variant]
        else:
            # No scheme, try both
            variants = [f"https://{url}", f"http://{url}"]
            
        # Try with and without www prefix if not present
        all_variants = []
        for variant in variants:
            parsed_variant = urlparse(variant)
            if parsed_variant.netloc.startswith('www.'):
                # Add non-www variant
                non_www = variant.replace(f"{parsed_variant.scheme}://www.", f"{parsed_variant.scheme}://")
                all_variants.extend([variant, non_www])
            else:
                # Add www variant
                www = variant.replace(f"{parsed_variant.scheme}://", f"{parsed_variant.scheme}://www.")
                all_variants.extend([variant, www])
                
        return list(set(all_variants))  # Remove duplicates

    def test_connectivity(self):
        """Test connectivity to various popular websites"""
        test_sites = [
            "https://example.com",
            "https://google.com",
            "https://mozilla.org",
            "https://python.org"
        ]
        
        print("\n=== CONNECTIVITY TEST ===")
        print("Testing connection to several public websites to diagnose network issues...")
        
        all_failed = True
        for site in test_sites:
            try:
                print(f"Testing connection to {site}...")
                response = self.session.get(site, timeout=5)
                if response.status_code == 200:
                    print(f"✓ Successfully connected to {site}")
                    all_failed = False
                else:
                    print(f"✗ Connected to {site} but received status code {response.status_code}")
            except Exception as e:
                print(f"✗ Failed to connect to {site}: {str(e)}")
                
        if all_failed:
            print("\nAll connectivity tests failed. This indicates a network issue with your server.")
            print("Your server may have restricted outbound connections or other network limitations.")
            return False
        else:
            print("\nSome connectivity tests succeeded, indicating your server can reach public websites.")
            return True
    
    def test_url_with_headless(self, url):
        """Test URL with headless browser as a last resort"""
        if not self.use_headless_browser or not self.driver:
            return False
            
        try:
            print(f"Attempting headless browser for: {url}")
            self.driver.get(url)
            time.sleep(3)
            if "Error" not in self.driver.title:
                return True
        except Exception as e:
            print(f"Headless browser error: {e}")
        return False
        
    def normalize_domain(self, domain):
        """Normalize a domain by removing www prefix"""
        if domain.startswith('www.'):
            return domain[4:]
        return domain
    
    def normalize_url(self, url):
        """Normalize a URL to avoid duplicates between www and non-www versions"""
        if not url:
            return ""
            
        # Handle data: URLs and other non-http URLs
        if url.startswith('data:') or url.startswith('mailto:') or url.startswith('tel:'):
            return url
            
        try:
            parsed = urlparse(url)
            
            # Normalize domain (remove www if present)
            netloc = parsed.netloc.lower()
            if netloc.startswith('www.'):
                netloc = netloc[4:]
                
            # Normalize path (remove trailing slash)
            path = parsed.path
            if path.endswith('/') and len(path) > 1:
                path = path[:-1]
                
            # Build normalized URL
            normalized = f"{parsed.scheme}://{netloc}{path}"
            if parsed.query:
                normalized += f"?{parsed.query}"
                
            return normalized
        except Exception as e:
            print(f"Error normalizing URL {url}: {e}")
            return url
        
    def domains_match(self, url1, url2):
        """Check if two URLs belong to the same domain (ignoring www)"""
        domain1 = self.normalize_domain(urlparse(url1).netloc)
        domain2 = self.normalize_domain(urlparse(url2).netloc)
        return domain1 == domain2

    def should_process_url(self, url):
        """Check if a URL should be processed (filter out images, PDFs, etc.)"""
        # Parse URL to get path
        parsed = urlparse(url)
        path = parsed.path.lower()
        
        # Skip common non-HTML file extensions
        skip_extensions = [
            '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp', '.ico',
            '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
            '.zip', '.rar', '.tar', '.gz', '.mp4', '.mp3', '.mov', '.avi',
            '.css', '.js', '.json', '.xml', '.rss', '.atom',
            '.ttf', '.otf', '.woff', '.woff2'
        ]
        
        for ext in skip_extensions:
            if path.endswith(ext):
                print(f"Skipping non-HTML file: {url}")
                return False
                
        # Also skip URLs with /wp-content/uploads/ as they're usually media files
        if '/wp-content/uploads/' in path:
            print(f"Skipping media file in uploads directory: {url}")
            return False
            
        # Skip wp-json API endpoints
        if '/wp-json/' in path:
            print(f"Skipping WordPress API endpoint: {url}")
            return False
            
        return True
        
    def crawl_website(self, start_url, max_pages=1000, force_test=False):
        """Crawl website starting from start_url"""
        
        # If we're not doing a forced test site, check connectivity first
        if not force_test and not self.test_connectivity():
            print("\nConnectivity test failed. Running with example.com as a test...")
            self.crawl_website("https://example.com", 2, force_test=True)
            return
            
        # Try different URL variants if not a forced test
        if not force_test:
            # Try different URL variants
            url_variants = self.try_url_variants(start_url)
            
            print(f"Trying different URL variants: {', '.join(url_variants)}")
            
            # Try each variant until one works
            working_url = None
            for variant in url_variants:
                try:
                    print(f"Testing URL variant: {variant}")
                    # Configure referrer depending on the URL
                    if 'www' in variant:
                        self.session.headers.update({'Referer': 'https://www.google.com/search?q=' + urlparse(variant).netloc})
                    else:
                        self.session.headers.update({'Referer': 'https://www.google.com/'})
                    
                    # Add delay to mimic human behavior
                    time.sleep(1)
                    
                    response = self.session.get(variant, timeout=15, allow_redirects=True)
                    if response.status_code == 200:
                        working_url = variant
                        print(f"Found working URL: {working_url}")
                        break
                except Exception as e:
                    print(f"Error testing {variant} with regular requests: {e}")
            
            # If all regular requests failed, try headless browser as last resort
            if not working_url and self.use_headless_browser:
                for variant in url_variants:
                    if self.test_url_with_headless(variant):
                        working_url = variant
                        print(f"Headless browser succeeded for: {working_url}")
                        break
            
            if not working_url:
                print("Could not find a working URL variant. Please check the domain and try again.")
                if not force_test:
                    print("\nFalling back to analyze example.com as a test...")
                    self.crawl_website("https://example.com", 2, force_test=True)
                return
                
            queue = [working_url]
        else:
            queue = [start_url]

        visited = set()  # URLs that have been processed (normalized)
        to_visit = set()  # All URLs that need to be visited (normalized)
        all_discovered = set()  # All discovered URLs across the site (normalized)
        url_mapping = {}  # Maps normalized URLs to original URLs for display
        pages_analyzed = 0
        
        # Add the initial URL to discovered URLs (normalized)
        normalized_start = self.normalize_url(queue[0])
        all_discovered.add(normalized_start)
        to_visit.add(normalized_start)
        url_mapping[normalized_start] = queue[0]
        
        print("\nStarting crawl: discovery is capped, but ALL discovered URLs will be analyzed in Phase 2...")

        # -----------------------------------------------------
        # First phase: discover as many URLs as possible
        # -----------------------------------------------------
        print("\n=== PHASE 1: URL DISCOVERY ===")
        # Cap the discovery phase; Phase 2 analyzes everything that was found
        discovery_limit = min(max_pages, 10) if force_test else min(max_pages, 50)
        discovery_count = 0
        
        while queue and discovery_count < discovery_limit:
            current_url = queue.pop(0)
            normalized_current = self.normalize_url(current_url)
            
            # Skip already visited URLs
            if normalized_current in visited:
                continue
                
            # Skip non-HTML content
            if not self.should_process_url(current_url):
                visited.add(normalized_current)
                if normalized_current in to_visit:
                    to_visit.remove(normalized_current)
                continue
                
            visited.add(normalized_current)
            if normalized_current in to_visit:
                to_visit.remove(normalized_current)
            
            page_data = self.analyze_page(current_url)
            discovery_count += 1
            
            if page_data:
                pages_analyzed += 1
                
                # Also mark the URL with www or without www as visited to avoid duplication
                alternate_url = self.get_alternate_domain_url(current_url)
                if alternate_url:
                    normalized_alt = self.normalize_url(alternate_url)
                    visited.add(normalized_alt)
                    if normalized_alt in to_visit:
                        to_visit.remove(normalized_alt)
                        
                print(f"Discovery phase: {discovery_count}/{discovery_limit} pages processed. Found {len(all_discovered)} URLs so far.")
                
                # Add ALL links to discovered_urls and to_visit
                for link in page_data["links"]:
                    # Normalize the URL for consistency
                    normalized_url = self.normalize_url(link)
                    
                    # Skip URLs we've already seen
                    if normalized_url in all_discovered:
                        continue
                    
                    # Skip non-HTML content
                    if not self.should_process_url(link):
                        continue
                        
                    all_discovered.add(normalized_url)
                    to_visit.add(normalized_url)
                    url_mapping[normalized_url] = link
                    
                    # Add to queue if we're still in discovery phase
                    if discovery_count < discovery_limit:
                        # Prioritize same-host links by putting them at the front of the queue
                        if self.domains_match(normalized_url, current_url):
                            queue.insert(0, link)
                        else:
                            queue.append(link)
        
        # -----------------------------------------------------
        # Second phase: analyze all remaining URLs
        # -----------------------------------------------------
        print(f"\n=== PHASE 2: ANALYZING ALL DISCOVERED URLs ===")
        remaining = list(to_visit)
        print(f"Found {len(all_discovered)} total URLs, {len(remaining)} remain to be analyzed")
        
        # Sort by domain first (prioritize same domain)
        base_domain = self.normalize_domain(urlparse(start_url).netloc)
        remaining.sort(key=lambda url: 0 if self.normalize_domain(urlparse(url_mapping.get(url, url)).netloc) == base_domain else 1)
        
        # Process all remaining URLs - NO limit on how many we analyze
        # We want to process ALL URLs for a complete site analysis
        for i, normalized_url in enumerate(remaining):
            if force_test and i >= 5:  # Only limit for test runs
                break
                
            if normalized_url not in visited:
                # Get the original URL format
                original_url = url_mapping.get(normalized_url, normalized_url)
                
                # Skip non-HTML content
                if not self.should_process_url(original_url):
                    visited.add(normalized_url)
                    continue
                    
                visited.add(normalized_url)
                
                print(f"Analyzing URL {i+1}/{len(remaining)}: {original_url}")
                page_data = self.analyze_page(original_url)
                
                if page_data:
                    pages_analyzed += 1
                    
                    # Also mark the URL with www or without www as visited to avoid duplication
                    alternate_url = self.get_alternate_domain_url(original_url)
                    if alternate_url:
                        normalized_alt = self.normalize_url(alternate_url)
                        visited.add(normalized_alt)
                        
                    # Add any new links found
                    for link in page_data.get("links", []):
                        normalized_link = self.normalize_url(link)
                        if normalized_link not in all_discovered and self.should_process_url(link):
                            all_discovered.add(normalized_link)
                            url_mapping[normalized_link] = link
                            
                            # If we haven't visited it yet, add it to the end of the remaining list
                            if normalized_link not in visited and normalized_link not in to_visit:
                                to_visit.add(normalized_link)
                                remaining.append(normalized_link)
        
        # -----------------------------------------------------
        # Final check for any missed URLs
        # -----------------------------------------------------
        # Check if we still have unanalyzed URLs
        missed_urls = to_visit - visited
        if missed_urls and not force_test:
            print(f"\n=== PHASE 3: FINAL SWEEP ===")
            print(f"Found {len(missed_urls)} URLs that were missed. Analyzing them now...")
            
            for i, normalized_url in enumerate(missed_urls):
                if normalized_url in visited:
                    continue
                    
                original_url = url_mapping.get(normalized_url, normalized_url)
                
                if not self.should_process_url(original_url):
                    visited.add(normalized_url)
                    continue
                    
                visited.add(normalized_url)
                
                print(f"Final sweep: Analyzing URL {i+1}/{len(missed_urls)}: {original_url}")
                page_data = self.analyze_page(original_url)
                
                if page_data:
                    pages_analyzed += 1
        
        # Make sure we update the page counts in the site map
        print(f"\nTotal URLs discovered: {len(all_discovered)}")
        print(f"Total pages analyzed: {pages_analyzed}")
        
        # Save all discovered URLs for reference
        with open(os.path.join(self.output_dir, "all_discovered_urls.txt"), "w") as f:
            for normalized_url in sorted(all_discovered):
                visited_status = "✓" if normalized_url in visited else "✗"
                original_url = url_mapping.get(normalized_url, normalized_url)
                f.write(f"{visited_status} {original_url} (normalized: {normalized_url})\n")
        
        if pages_analyzed > 0:
            # Generate rebuild context after crawl
            self.generate_rebuild_context()
            
            print(f"Analysis complete. Analyzed {pages_analyzed} pages.")
            print(f"Check the '{self.output_dir}' directory for results.")
            print(f"Content text files are in '{os.path.join(self.output_dir, 'content')}'")
            print(f"Full URL list saved to '{os.path.join(self.output_dir, 'all_discovered_urls.txt')}'")
            print(f"Use the rebuild_context.json file to prompt an AI to rebuild the site with Tailwind CSS.")
        else:
            print("No pages were successfully analyzed. Please check the URL and try again.")
            
        if force_test:
            print("\nThis was a test run with example.com")
            print("If this worked, your script is working correctly but the original site might be unavailable")
            
    def get_alternate_domain_url(self, url):
        """Get the alternate domain version of a URL (with/without www)"""
        parsed = urlparse(url)
        netloc = parsed.netloc
        
        # If has www, return version without www
        if netloc.startswith('www.'):
            return url.replace(f"{parsed.scheme}://www.", f"{parsed.scheme}://")
        # If doesn't have www, return version with www
        else:
            return url.replace(f"{parsed.scheme}://", f"{parsed.scheme}://www.")

if __name__ == "__main__":
    print("""
╔════════════════════════════════════════════════════════╗
║ Tailwind Rebuilder - Website Analyzer                  ║
║ Analyzes website structure for Tailwind CSS rebuilding ║
╚════════════════════════════════════════════════════════╝
    """)
    
    # Test internet connection
    try:
        socket.create_connection(("www.google.com", 80), timeout=5).close()
        print("✓ Internet connection test: Success!")
    except OSError:
        print("⚠ WARNING: Internet connection test failed. You may have connectivity issues.")
        
    print("\nOptions:")
    print("1. Analyze your website (standard mode)")
    print("2. Analyze your website with headless browser (for JavaScript-heavy sites)")
    print("3. Test with example.com")
    
    choice = input("\nEnter your choice (1, 2, or 3): ").strip()
    
    if choice == "3":
        print("\nRunning test with example.com...")
        analyzer = WebsiteAnalyzer(use_headless_browser=False)
        analyzer.crawl_website("https://example.com", 2, force_test=True)
    else:
        use_headless = (choice == "2")
        if use_headless:
            print("\nUsing headless browser mode for JavaScript rendering...")
            if not SELENIUM_AVAILABLE:
                print("Selenium not available. Install with: pip install selenium webdriver-manager")
                sys.exit(1)
                
        analyzer = WebsiteAnalyzer(use_headless_browser=use_headless)
        
        website_url = input("\nEnter the website URL to analyze (with or without http/https): ").strip()
        
        if not website_url:
            print("No URL entered. Exiting.")
            sys.exit(1)
        
        # Ensure URL has proper scheme
        if not website_url.startswith(('http://', 'https://')):
            website_url = 'https://' + website_url
            print(f"Added https:// prefix. Using: {website_url}")
        
        max_discovery = input("Enter maximum initial discovery pages (default: 20) - this will find most URLs: ").strip()
        max_discovery = int(max_discovery) if max_discovery and max_discovery.isdigit() else 20
        
        print(f"\nStarting analysis of {website_url}")
        print("This script will:")
        print("1. Discover URLs in an initial crawl phase")
        print("2. Then analyze ALL discovered URLs")
        print("3. Save all content and structure for Tailwind rebuilding")
        
        analyzer.crawl_website(website_url, max_discovery) 