import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse, urldefrag
from bs4 import BeautifulSoup
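
# Overview: a single-file, breadth-first crawler for one site. For each page it
# saves the visible text, downloads images, logs form structures, and collects
# font/color CSS rules; when the crawl ends it writes every URL it discovered.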

# Global sets to reduce duplication
seen_css_lines = set()
seen_forms = set()
unique_urls = set()  # To store all unique URLs encountered

# Ensure the output directories exist and reset the config files
def prepare_directories():
    for directory in ("images", "content", "config"):
        os.makedirs(directory, exist_ok=True)
    # Truncate the config files so each run starts from a clean slate
    for name in ("image-map.txt", "forms.txt", "design-over.css", "urls.txt"):
        open(os.path.join("config", name), "w").close()
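
# Resulting layout (filled in by the functions below):
#   images/   downloaded image files
#   content/  one <page>-content.txt file per crawled page
#   config/   image-map.txt, forms.txt, design-over.css, urls.txt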

# Check if the URL is on the same domain
def same_domain(base, target):
    return urlparse(base).netloc == urlparse(target).netloc
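
# Example: same_domain("https://example.com/a", "https://example.com/b") is True,
# but a subdomain such as "https://blog.example.com/" counts as a different site
# because the comparison uses the full netloc (host plus port).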

# Save page inner text to file (in content directory)
def process_text(soup, page_url):
    text = soup.get_text(separator="\n", strip=True)
    parsed = urlparse(page_url)
    # Determine filename based on path
    if parsed.path in ("", "/"):
        filename = "home-content.txt"
    else:
        safe_path = parsed.path.strip("/").replace("/", "-")
        if not safe_path:
            safe_path = "page"
        filename = f"{safe_path}-content.txt"
    filepath = os.path.join("content", filename)
    try:
        with open(filepath, "w", encoding="utf-8") as ftext:
            ftext.write(text)
    except Exception as e:
        print(f"Error writing text for {page_url}: {e}")

# Process images: download and log mapping (in config directory)
def process_images(soup, page_url):
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue
        full_url = urljoin(page_url, src)
        try:
            response = requests.get(full_url, stream=True, timeout=10)
            if response.status_code == 200:
                parsed = urlparse(full_url)
                file_name = os.path.basename(parsed.path)
                if not file_name:
                    file_name = f"image_{int(time.time()*1000)}.jpg"
                image_path = os.path.join("images", file_name)
                with open(image_path, "wb") as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                with open(os.path.join("config", "image-map.txt"), "a", encoding="utf-8") as fmap:
                    fmap.write(f"{page_url} -> {full_url} saved as {file_name}\n")
        except Exception as e:
            print(f"Error downloading image {full_url}: {e}")

# Process forms: log form details, avoiding duplicates
def process_forms(soup, page_url):
    forms = soup.find_all("form")
    if forms:
        with open(os.path.join("config", "forms.txt"), "a", encoding="utf-8") as fform:
            for form in forms:
                action = form.get("action")
                method = form.get("method", "GET").upper()
                # Create a signature string for deduplication
                inputs = []
                for tag in form.find_all(["input", "textarea", "select"]):
                    tag_name = tag.name
                    tag_type = tag.get("type", "")
                    tag_field = tag.get("name", "N/A")
                    inputs.append(f"{tag_name}:{tag_field}:{tag_type}")
                signature = f"{action}|{method}|{'|'.join(sorted(inputs))}"
                if signature in seen_forms:
                    continue
                seen_forms.add(signature)
                fform.write(f"Page: {page_url}\n")
                fform.write(f"Form: action={action}, method={method}\n")
                for item in inputs:
                    fform.write(f"    {item}\n")
                fform.write("\n")
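
# Each forms.txt entry looks like:
#   Page: <page_url>
#   Form: action=<action>, method=<METHOD>
#       <tag>:<name>:<type>   (one line per input, textarea, or select)
# A form with the same action, method, and fields as one already seen is skipped.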

# Process CSS: extract inline and external CSS, deduplicate and store only font-face and color information
def process_css(soup, page_url):
    css_content = ""
    # Inline <style> blocks
    for style in soup.find_all("style"):
        css_content += style.get_text() + "\n"
    # External CSS files
    for link in soup.find_all("link", rel="stylesheet"):
        href = link.get("href")
        if href:
            full_css_url = urljoin(page_url, href)
            try:
                r = requests.get(full_css_url, timeout=10)
                if r.status_code == 200:
                    css_content += r.text + "\n"
            except Exception as e:
                print(f"Error fetching CSS {full_css_url}: {e}")
    # Filter to keep only @font-face, font-family, or color lines
    new_lines = []
    for line in css_content.splitlines():
        if re.search(r'(@font-face|font-family|color\s*:)', line, re.IGNORECASE):
            clean_line = line.strip()
            if clean_line and clean_line not in seen_css_lines:
                seen_css_lines.add(clean_line)
                new_lines.append(clean_line)
    if new_lines:
        with open(os.path.join("config", "design-over.css"), "a", encoding="utf-8") as fcss:
            fcss.write("\n".join(new_lines) + "\n")
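
# design-over.css ends up as a flat, de-duplicated list of the lines that mention
# @font-face, font-family, or a color property; it is a design reference rather
# than a loadable stylesheet, since surrounding selectors and braces may be missing.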

# Crawl the site (simple breadth-first crawler within same domain)
def crawl_site(start_url, max_pages=50):
    visited = set()
    queue = [start_url]
    unique_urls.add(start_url)
    while queue and len(visited) < max_pages:
        current_url = queue.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)
        try:
            print(f"Processing: {current_url}")
            response = requests.get(current_url, timeout=10)
            if response.status_code != 200:
                continue
            # Only parse HTML responses; skip PDFs, images, and other binary files
            if "text/html" not in response.headers.get("Content-Type", ""):
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            # Process the page content, images, forms, and CSS
            process_text(soup, current_url)
            process_images(soup, current_url)
            process_forms(soup, current_url)
            process_css(soup, current_url)
            # Queue new links within the same domain; strip #fragments so the
            # same page is not queued once per anchor
            for link in soup.find_all("a", href=True):
                next_url, _ = urldefrag(urljoin(current_url, link["href"]))
                if same_domain(start_url, next_url):
                    # Add to unique URLs set
                    unique_urls.add(next_url)
                    if next_url not in visited:
                        queue.append(next_url)
        except Exception as e:
            print(f"Error processing page {current_url}: {e}")

# After crawling, write the unique URLs to config/urls.txt
def write_unique_urls():
    try:
        with open(os.path.join("config", "urls.txt"), "w", encoding="utf-8") as f:
            for url in sorted(unique_urls):
                f.write(url + "\n")
    except Exception as e:
        print(f"Error writing unique URLs: {e}")

if __name__ == "__main__":
    prepare_directories()
    start_url = input("Enter the website URL to crawl: ").strip()
    if not start_url:
        print("No URL entered. Exiting.")
        raise SystemExit(1)
    # Assume HTTPS when the scheme was omitted (e.g. a bare "example.com")
    if not start_url.startswith(("http://", "https://")):
        start_url = "https://" + start_url
    crawl_site(start_url)
    write_unique_urls()
    print("Crawling complete. Check the 'content' and 'config' directories and the 'images' folder for output.")