#!/usr/bin/env python3
"""
Tailwind Site Analyzer
----------------------
A tool to analyze websites and prepare content for rebuilding with Tailwind CSS.
This improved version focuses on accurately capturing all pages and ensuring
they are properly included in the rebuild context.
"""

import os
import re
import time
import json
import random
import socket
import requests
from urllib.parse import urljoin, urlparse, urlunparse
from bs4 import BeautifulSoup
from collections import defaultdict

# Optional imports for headless browser mode
try:
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    SELENIUM_AVAILABLE = True
except ImportError:
    SELENIUM_AVAILABLE = False
    print("Selenium not installed. Headless browser mode will not be available.")
    print("To enable, install with: pip install selenium webdriver-manager")

class UrlManager:
    """Manages URLs, normalization, and tracking of visited/unvisited URLs"""
    
    def __init__(self, base_url):
        self.base_url = self.normalize_url(base_url)
        self.base_domain = self.get_domain(base_url)
        self.discovered_urls = set()  # All URLs we've found
        self.visited_urls = set()     # URLs we've processed
        self.queued_urls = []         # URLs queued for processing
        self.url_mapping = {}         # Maps normalized URLs to original form
        self.pages_data = {}          # Maps normalized URLs to page data
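        # queued_urls is consumed FIFO (see get_next_url), so pages are crawled
        # breadth-first in the order they are discovered from the base URL.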
        
    def get_domain(self, url):
        """Extract and normalize domain from URL"""
        parsed = urlparse(url)
        domain = parsed.netloc.lower()
        if domain.startswith('www.'):
            domain = domain[4:]
        return domain
    
    def domains_match(self, url1, url2):
        """Check if two URLs belong to the same domain"""
        return self.get_domain(url1) == self.get_domain(url2)
    
    def normalize_url(self, url):
        """Normalize URL to avoid duplicates"""
        if not url:
            return ""
        
        # Handle non-http URLs
        if url.startswith(('data:', 'mailto:', 'tel:', 'javascript:')):
            return url
        
        try:
            parsed = urlparse(url)
            
            # Use lowercase
            scheme = parsed.scheme.lower() or 'http'
            netloc = parsed.netloc.lower()
            
            # Remove 'www.' prefix
            if netloc.startswith('www.'):
                netloc = netloc[4:]
                
            # Remove trailing slash from path
            path = parsed.path
            if path.endswith('/') and len(path) > 1:
                path = path[:-1]
                
            # Handle empty paths
            if not path:
                path = '/'
                
            # Build normalized URL
            normalized = urlunparse((scheme, netloc, path, parsed.params, parsed.query, ''))
            return normalized
            
        except Exception as e:
            print(f"Error normalizing URL {url}: {e}")
            return url
    
    def add_url(self, url, referring_url=None):
        """Add a URL to the crawl queue if it qualifies"""
        if not url:
            return False

        # Skip non-crawlable schemes and fragment-only links
        if url.startswith(('mailto:', 'tel:', 'javascript:', 'data:', '#')):
            return False

        # Resolve relative URLs against the referring page
        if not url.startswith(('http://', 'https://')):
            if referring_url:
                url = urljoin(referring_url, url)
            else:
                return False
                
        # Normalize URL
        normalized_url = self.normalize_url(url)
        
        # Skip if already discovered
        if normalized_url in self.discovered_urls:
            return False
            
        # Check if domain matches base domain
        if not self.domains_match(normalized_url, self.base_url):
            return False
            
        # Skip non-HTML content
        if not self.is_html_url(url):
            return False
            
        # Add to tracking collections
        self.discovered_urls.add(normalized_url)
        self.url_mapping[normalized_url] = url
        self.queued_urls.append(normalized_url)
        return True
        
    def is_html_url(self, url):
        """Check if URL likely points to HTML content based on extension"""
        parsed = urlparse(url)
        path = parsed.path.lower()
        
        # Skip common non-HTML extensions
        skip_extensions = [
            '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.bmp', '.ico',
            '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
            '.zip', '.rar', '.tar', '.gz', '.mp4', '.mp3', '.mov', '.avi',
            '.css', '.js', '.json', '.xml', '.rss', '.atom',
            '.ttf', '.otf', '.woff', '.woff2'
        ]
        
        for ext in skip_extensions:
            if path.endswith(ext):
                return False
                
        # Skip media in uploads directory (common in WordPress)
        if '/wp-content/uploads/' in path:
            return False
            
        # Skip API endpoints
        if '/wp-json/' in path or '/api/' in path:
            return False
            
        return True
        
    def get_next_url(self):
        """Get next URL to process"""
        if not self.queued_urls:
            return None
            
        # Get and remove URL from queue
        normalized_url = self.queued_urls.pop(0)
        original_url = self.url_mapping.get(normalized_url, normalized_url)
        
        # Mark as visited
        self.visited_urls.add(normalized_url)
        
        return original_url
        
    def get_stats(self):
        """Get statistics about URLs"""
        return {
            "discovered": len(self.discovered_urls),
            "visited": len(self.visited_urls),
            "queued": len(self.queued_urls)
        }
        
    def get_all_pages(self):
        """Get data for all pages"""
        return self.pages_data
        
    def add_page_data(self, normalized_url, page_data):
        """Add page data for a URL"""
        self.pages_data[normalized_url] = page_data


class ContentManager:
    """Manages content extraction and storage"""
    
    def __init__(self, output_dir="site_analysis"):
        self.output_dir = output_dir
        self.images = {}  # Maps image URL to local path
        self.components = {}  # Maps component IDs to component data
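        # Output layout: images/ (downloaded assets), content/ (raw HTML and text),
        # pages/ (per-page JSON); components/ is created but not written to in this module.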
        self.prepare_directories()
        
    def prepare_directories(self):
        """Create necessary output directories"""
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
            
        subdirs = ["images", "components", "content", "pages"]
        for subdir in subdirs:
            path = os.path.join(self.output_dir, subdir)
            if not os.path.exists(path):
                os.makedirs(path)
                
    def save_page_content(self, url, html_content, page_data):
        """Save page content to files"""
        parsed = urlparse(url)
        path = parsed.path.strip("/")
        
        # Generate filename
        if not path:
            filename = "home"
        else:
            filename = path.replace('/', '_')
            if len(filename) > 100:
                filename = filename[:100]
                
        # Save HTML content
        html_path = os.path.join(self.output_dir, "content", f"{filename}.html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html_content)
            
        # Extract and save text content
        soup = BeautifulSoup(html_content, "html.parser")
        text_content = soup.get_text(separator="\n", strip=True)
        text_path = os.path.join(self.output_dir, "content", f"{filename}.txt")
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text_content)
            
        # Save page data
        json_path = os.path.join(self.output_dir, "pages", f"{filename}.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(page_data, f, indent=2)
            
        return {
            "html_path": html_path,
            "text_path": text_path,
            "json_path": json_path
        }
        
    def download_image(self, img_url, referer_url, session):
        """Download image and return local path"""
        # Skip if already downloaded
        if img_url in self.images:
            return self.images[img_url]
            
        try:
            # Skip data URLs
            if img_url.startswith('data:'):
                return None
                
            # Prepare headers with referer
            headers = session.headers.copy()
            headers['Referer'] = referer_url
            
            # Download image
            response = session.get(img_url, stream=True, timeout=10, headers=headers)
            
            if response.status_code == 200:
                # Verify it's an image
                content_type = response.headers.get('Content-Type', '')
                if not content_type.startswith('image/'):
                    return None
                    
                # Generate filename
                parsed = urlparse(img_url)
                filename = os.path.basename(parsed.path)
                
                # Add extension if missing
                if not filename or '.' not in filename:
                    ext = 'jpg'  # Default
                    if 'image/png' in content_type:
                        ext = 'png'
                    elif 'image/gif' in content_type:
                        ext = 'gif'
                    elif 'image/svg' in content_type:
                        ext = 'svg'
                    elif 'image/webp' in content_type:
                        ext = 'webp'
                    filename = f"img_{int(time.time()*1000)}.{ext}"
                
                # Ensure unique filename
                base, ext = os.path.splitext(filename)
                counter = 0
                while os.path.exists(os.path.join(self.output_dir, "images", filename)):
                    counter += 1
                    filename = f"{base}_{counter}{ext}"
                    
                # Save image
                img_path = os.path.join(self.output_dir, "images", filename)
                with open(img_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                        
                # Store in cache
                self.images[img_url] = filename
                return filename
                
        except Exception as e:
            print(f"Error downloading image {img_url}: {e}")
            
        return None
        
    def extract_component(self, element, component_type):
        """Extract structured data from a component"""
        component_data = {
            "type": component_type,
            "id": element.get("id", ""),
            "classes": element.get("class", []),
            "content": {}
        }
        
        # Extract headings
        headings = element.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
        if headings:
            component_data["content"]["headings"] = [
                {"level": h.name, "text": h.get_text(strip=True)}
                for h in headings
            ]
            
        # Extract paragraphs
        paragraphs = element.find_all("p")
        if paragraphs:
            component_data["content"]["paragraphs"] = [
                p.get_text(strip=True) for p in paragraphs
            ]
            
        # Extract links
        links = element.find_all("a", href=True)
        if links:
            component_data["content"]["links"] = [
                {"text": a.get_text(strip=True), "href": a.get("href", "")}
                for a in links if a.get_text(strip=True)
            ]
            
        # Extract images
        images = element.find_all("img")
        if images:
            component_data["content"]["images"] = [
                {"src": img.get("src", ""), "alt": img.get("alt", "")}
                for img in images if img.get("src")
            ]
            
        # Extract background images from inline styles (including the component's own style)
        bg_images = []
        styled_tags = element.find_all(style=True)
        if element.has_attr("style"):
            styled_tags.insert(0, element)
        for tag in styled_tags:
            style = tag.get("style", "")
            bg_match = re.search(r'background-image:\s*url\([\'"]?(.*?)[\'"]?\)', style)
            if bg_match:
                bg_images.append({"src": bg_match.group(1), "element": tag.name})
        
        if bg_images:
            component_data["content"]["background_images"] = bg_images
            
        return component_data
        
    def detect_component_type(self, element):
        """Detect component type from element"""
        if element.name == "header":
            return "header"
        elif element.name == "nav" or (element.name and "nav" in element.get("class", [])):
            return "nav"
        elif element.name == "footer":
            return "footer"
        elif element.name == "section":
            return "section"
        elif element.name == "main":
            return "main"
        elif element.name == "article":
            return "article"
        elif element.name == "aside":
            return "sidebar"
        elif element.name == "div" and element.get("id") in ["header", "nav", "navigation", "footer", "main", "content"]:
            return element.get("id")
        elif element.name == "div" and any(c in element.get("class", []) for c in ["header", "nav", "navigation", "footer", "main", "content"]):
            for c in element.get("class", []):
                if c in ["header", "nav", "navigation", "footer", "main", "content"]:
                    return c
        elif element.name == "div" and element.find(["h1", "h2", "h3"]):
            return "section"
        return None


class WebsiteAnalyzer:
    """Main class for analyzing websites for Tailwind rebuilding"""
    
    def __init__(self, use_headless=False):
        self.use_headless = use_headless and SELENIUM_AVAILABLE
        self.driver = None
        self.session = requests.Session()
        self.setup_session()
        
        if self.use_headless:
            self.setup_headless_browser()
    
    def setup_session(self):
        """Set up the requests session with browser-like headers"""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0'
        }
        self.session.headers.update(headers)
    
    def setup_headless_browser(self):
        """Set up headless browser if available"""
        if not SELENIUM_AVAILABLE:
            return
            
        try:
            options = Options()
            options.add_argument("--headless")
            options.add_argument("--no-sandbox")
            options.add_argument("--disable-dev-shm-usage")
            options.add_argument("--window-size=1920,1080")
            options.add_argument("--disable-extensions")
            options.add_argument("--disable-gpu")
            
            # Add user agent
            options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
            
            self.driver = webdriver.Chrome(
                service=Service(ChromeDriverManager().install()),
                options=options
            )
            print("Headless browser ready.")
        except Exception as e:
            print(f"Failed to initialize headless browser: {e}")
            self.use_headless = False
    
    def __del__(self):
        """Clean up resources"""
        if getattr(self, "driver", None):
            self.driver.quit()
    
    def test_connectivity(self):
        """Test basic internet connectivity"""
        test_sites = [
            "https://example.com",
            "https://google.com",
            "https://mozilla.org"
        ]
        
        print("Testing connectivity...")
        for site in test_sites:
            try:
                response = self.session.get(site, timeout=5)
                if response.status_code == 200:
                    print(f"√ Successfully connected to {site}")
                    return True
            except Exception as e:
                print(f"✗ Failed to connect to {site}: {e}")
                
        print("All connectivity tests failed. Check your internet connection.")
        return False
    
    def get_page_content(self, url):
        """Get page content using either requests or headless browser"""
        if self.use_headless and self.driver:
            try:
                print(f"Using headless browser to fetch: {url}")
                self.driver.get(url)
                time.sleep(3)  # Wait for page to load
                return self.driver.page_source
            except Exception as e:
                print(f"Headless browser error: {e}")
                print("Falling back to regular requests...")
        
        # Add jitter to seem more human-like
        time.sleep(random.uniform(0.5, 2.0))
        
        # Try to get page with requests
        response = self.session.get(url, timeout=15, allow_redirects=True)
        response.raise_for_status()
        return response.text
    
    def analyze_page(self, url, url_manager, content_manager):
        """Analyze a single page and extract its structure"""
        normalized_url = url_manager.normalize_url(url)
        
        try:
            print(f"Analyzing page: {url}")
            
            # Get page content
            html_content = self.get_page_content(url)
            soup = BeautifulSoup(html_content, "html.parser")
            
            # Basic page data
            page_data = {
                "url": url,
                "normalized_url": normalized_url,
                "title": soup.title.get_text() if soup.title else "",
                "components": [],
                "links": [],
                "images": []
            }
            
            # Extract all links
            for a in soup.find_all("a", href=True):
                href = a.get("href")
                if href:
                    # Skip fragment-only, javascript, and mailto links
                    if href.startswith(('#', 'javascript:', 'mailto:')):
                        continue
                        
                    full_url = urljoin(url, href)
                    link_text = a.get_text(strip=True)
                    
                    page_data["links"].append({
                        "url": full_url,
                        "text": link_text or "[No Text]"
                    })
                    
                    # Add URL to crawler queue
                    url_manager.add_url(full_url, url)
            
            # Extract all images
            for img in soup.find_all("img", src=True):
                src = img.get("src")
                if src:
                    full_src = urljoin(url, src)
                    alt = img.get("alt", "")
                    
                    # Add to images list
                    img_data = {
                        "src": full_src,
                        "alt": alt,
                        "local_path": None
                    }
                    
                    # Download the image
                    local_path = content_manager.download_image(full_src, url, self.session)
                    if local_path:
                        img_data["local_path"] = local_path
                        
                    page_data["images"].append(img_data)
            
            # Find and analyze components
            for element in soup.find_all(["header", "nav", "section", "footer", "main", "article", "div", "aside"]):
                component_type = content_manager.detect_component_type(element)
                if component_type:
                    component_data = content_manager.extract_component(element, component_type)
                    page_data["components"].append(component_data)
                    
            # If no components were found, try to infer structure
            if not page_data["components"]:
                # Find the main content area; find() only matches tag names, so use a
                # CSS selector for the id-based candidates
                main_content = soup.select_one("main, div#content, div#main, article")
                if main_content:
                    component_data = content_manager.extract_component(main_content, "main")
                    page_data["components"].append(component_data)
                else:
                    # Use body as fallback
                    body = soup.body
                    if body:
                        component_data = content_manager.extract_component(body, "body")
                        page_data["components"].append(component_data)
            
            # Save page data
            file_paths = content_manager.save_page_content(url, html_content, page_data)
            page_data["file_paths"] = file_paths
            
            # Add page data to URL manager
            url_manager.add_page_data(normalized_url, page_data)
            
            print(f"Successfully analyzed page: {url}")
            print(f"Found {len(page_data['links'])} links and {len(page_data['images'])} images")
            
            return page_data
            
        except Exception as e:
            print(f"Error analyzing page {url}: {e}")
            return None
    
    def analyze_website(self, start_url, max_pages=100):
        """Analyze entire website starting from start_url"""
        if not self.test_connectivity():
            print("Connectivity test failed. Check your internet connection.")
            return None
            
        # Initialize managers
        url_manager = UrlManager(start_url)
        content_manager = ContentManager()
        
        # Add start URL
        url_manager.add_url(start_url)
        
        pages_analyzed = 0
        
        print(f"\nStarting website analysis from {start_url}")
        print(f"Will analyze up to {max_pages} pages")
        
        while pages_analyzed < max_pages:
            # Get next URL to process
            url = url_manager.get_next_url()
            if not url:
                print("No more URLs to process")
                break
                
            # Analyze page
            page_data = self.analyze_page(url, url_manager, content_manager)
            if page_data:
                pages_analyzed += 1
                
            # Print progress
            stats = url_manager.get_stats()
            print(f"Progress: {pages_analyzed}/{max_pages} pages analyzed")
            print(f"URLs: {stats['discovered']} discovered, {stats['visited']} visited, {stats['queued']} queued")
            
            if pages_analyzed and pages_analyzed % 10 == 0:
                # Generate an interim rebuild context to save progress
                self.generate_rebuild_context(url_manager, content_manager)
                
        # Generate final rebuild context
        rebuild_context = self.generate_rebuild_context(url_manager, content_manager)
        
        print(f"\nAnalysis complete!")
        print(f"Analyzed {pages_analyzed} pages")
        print(f"Rebuild context saved to {os.path.join(content_manager.output_dir, 'rebuild_context.json')}")
        
        return rebuild_context
    
    def generate_rebuild_context(self, url_manager, content_manager):
        """Generate rebuild context from analyzed pages"""
        all_pages = url_manager.get_all_pages()
        
        if not all_pages:
            print("No pages data found")
            return None
            
        print(f"Generating rebuild context from {len(all_pages)} pages...")
        
        # Determine site name from home page or first page
        site_name = "Untitled Site"
        home_page = None
        for norm_url, page_data in all_pages.items():
            parsed = urlparse(norm_url)
            if parsed.path == '/' or not parsed.path:
                home_page = page_data
                if page_data["title"]:
                    site_name = page_data["title"]
                break
                
        if not home_page:
            # Fall back to the first analyzed page (all_pages is non-empty here)
            first_page = next(iter(all_pages.values()))
            if first_page["title"]:
                site_name = first_page["title"]
                
        # Get site domain
        base_domain = url_manager.base_domain
        
        # Compile page information
        pages_info = []
        for norm_url, page_data in all_pages.items():
            # Parse URL for display
            parsed = urlparse(page_data["url"])
            display_url = parsed.path
            if not display_url or display_url == "/":
                display_url = "Home Page"
            else:
                display_url = display_url.strip("/")
                
            # Count components by type
            component_counts = defaultdict(int)
            for component in page_data["components"]:
                component_counts[component["type"]] += 1
                
            # Create page summary
            page_info = {
                "url": page_data["url"],
                "display_url": display_url,
                "title": page_data["title"],
                "structure": dict(component_counts),
                "images_count": len(page_data["images"]),
                "local_content": page_data.get("file_paths", {})
            }
            
            pages_info.append(page_info)
            
        # Find color mentions in titles and content
        color_terms = ['blue', 'green', 'red', 'yellow', 'orange', 'purple', 
                       'pink', 'black', 'white', 'gray', 'grey']
        color_mentions = defaultdict(int)
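        # Rough keyword heuristic: title matches are weighted double, and the top
        # three colors become the suggested primary/secondary/accent scheme below.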
        
        for page_data in all_pages.values():
            title = page_data["title"].lower()
            for color in color_terms:
                if color in title:
                    color_mentions[color] += 2  # Weight title mentions more
                    
            # Check component content for color mentions
            for component in page_data["components"]:
                for heading in component.get("content", {}).get("headings", []):
                    for color in color_terms:
                        if color in heading["text"].lower():
                            color_mentions[color] += 1
                            
        # Determine color scheme
        color_scheme = []
        if color_mentions:
            sorted_colors = sorted(color_mentions.items(), key=lambda x: x[1], reverse=True)
            if sorted_colors:
                primary = sorted_colors[0][0]
                color_scheme.append(f"Primary color: {primary}")
                
                if len(sorted_colors) > 1:
                    secondary = sorted_colors[1][0]
                    color_scheme.append(f"Secondary color: {secondary}")
                    
                if len(sorted_colors) > 2:
                    accent = sorted_colors[2][0]
                    color_scheme.append(f"Accent color: {accent}")
        
        # Collect image statistics
        all_images = []
        for page_data in all_pages.values():
            for img in page_data["images"]:
                if img.get("local_path"):
                    all_images.append(img["local_path"])
                    
        # Create rebuild context
        rebuild_context = {
            "site_structure": {
                "site_name": site_name,
                "domain": base_domain,
                "num_pages": len(all_pages),
                "pages": pages_info
            },
            "tailwind_rebuild_notes": {
                "general": "This site should be rebuilt using Tailwind CSS utility classes for responsive design",
                "components": {
                    "header": "Create a responsive header with Tailwind, using flex or grid for layout",
                    "nav": "Build navigation with Tailwind's utility classes for responsive design",
                    "section": "Use Tailwind's spacing, typography, and color utilities for content sections",
                    "footer": "Create a footer with Tailwind, consider using grid for complex layouts"
                },
                "color_scheme": color_scheme if color_scheme else ["Use neutral color scheme based on Tailwind defaults"],
                "responsive_design": "Ensure the site is fully responsive using Tailwind's breakpoint utilities",
                "accessibility": "Implement proper contrast ratios and ARIA attributes for accessibility"
            },
            "site_map": {
                "total_unique_urls": len(url_manager.discovered_urls),
                "pages_analyzed": len(url_manager.visited_urls),
                "percentage_coverage": round((len(url_manager.visited_urls) / len(url_manager.discovered_urls)) * 100, 2) if url_manager.discovered_urls else 0
            },
            "assets": {
                "total_images_found": sum(len(page["images"]) for page in all_pages.values()),
                "unique_images_downloaded": len(set(all_images)),
                "image_files": list(set(all_images))[:20]  # Show first 20 unique images
            }
        }
        
        # Save the rebuild context
        with open(os.path.join(content_manager.output_dir, "rebuild_context.json"), "w", encoding="utf-8") as f:
            json.dump(rebuild_context, f, indent=2)
            
        return rebuild_context


def try_url_variants(url):
    """Try different URL variants (http/https, www/non-www)"""
    variants = []
    
    # Ensure we have a scheme
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
        
    parsed = urlparse(url)
    
    # Try both http and https (a scheme is guaranteed at this point)
    if parsed.scheme == 'https':
        variants = [url, 'http://' + url[len('https://'):]]
    else:
        variants = [url, 'https://' + url[len('http://'):]]
        
    # Try with and without www
    all_variants = []
    for variant in variants:
        parsed_variant = urlparse(variant)
        if parsed_variant.netloc.startswith('www.'):
            # Add non-www variant
            non_www = variant.replace(f"{parsed_variant.scheme}://www.", f"{parsed_variant.scheme}://")
            all_variants.extend([variant, non_www])
        else:
            # Add www variant
            www = variant.replace(f"{parsed_variant.scheme}://", f"{parsed_variant.scheme}://www.")
            all_variants.extend([variant, www])
            
    return list(set(all_variants))


def find_working_url(url, session):
    """Find a working URL from various variants"""
    variants = try_url_variants(url)
    print(f"Testing URL variants: {', '.join(variants)}")
    
    for variant in variants:
        try:
            response = session.get(variant, timeout=10)
            if response.status_code == 200:
                print(f"Found working URL: {variant}")
                return variant
        except Exception as e:
            print(f"Error testing {variant}: {e}")
            
    return None


if __name__ == "__main__":
    print("""
╔════════════════════════════════════════════════════════╗
║ Tailwind Site Analyzer                                 ║
║ Advanced website analyzer for Tailwind CSS rebuilding  ║
╚════════════════════════════════════════════════════════╝
    """)
    
    # Test basic internet connection
    try:
        socket.create_connection(("www.google.com", 80), timeout=5).close()
        print("✓ Internet connection test: Success!")
    except OSError:
        print("⚠ WARNING: Internet connection test failed. You may have connectivity issues.")
        
    print("\nOptions:")
    print("1. Analyze website (regular mode)")
    print("2. Analyze website with headless browser (for JavaScript-heavy sites)")
    print("3. Test with example.com")
    
    choice = input("\nEnter your choice (1, 2, or 3): ").strip()
    
    use_headless = (choice == "2")
    
    if choice == "3":
        print("\nRunning test with example.com...")
        analyzer = WebsiteAnalyzer(use_headless=False)
        analyzer.analyze_website("https://example.com", max_pages=3)
    else:
        # Normal website analysis
        if use_headless and not SELENIUM_AVAILABLE:
            print("Selenium not available. Install with: pip install selenium webdriver-manager")
            print("Continuing in regular mode...")
            use_headless = False
            
        analyzer = WebsiteAnalyzer(use_headless=use_headless)
        
        website_url = input("\nEnter the website URL to analyze: ").strip()
        
        if not website_url:
            print("No URL entered. Exiting.")
            exit(1)
            
        # Find working URL variant
        session = requests.Session()
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        
        working_url = find_working_url(website_url, session)
        
        if not working_url:
            print("Could not find a working URL variant. Please check the domain and try again.")
            exit(1)
            
        max_pages = input("Enter maximum number of pages to analyze (default: 50): ").strip()
        max_pages = int(max_pages) if max_pages and max_pages.isdigit() else 50
        
        print(f"\nStarting analysis of {working_url}")
        print("This will:")
        print("1. Find and analyze all pages on the site")
        print("2. Extract components, images, and structure")
        print("3. Create a comprehensive rebuild context for Tailwind CSS development")
        
        analyzer.analyze_website(working_url, max_pages) 