import time
from typing import List, Set
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

def get_url_depth(url: str) -> int:
    """Calculate the directory depth of a URL's path (root path = 0)."""
    path = urlparse(url).path.strip('/')
    return len(path.split('/')) if path else 0
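
# Illustrative examples (example.com is a placeholder, not a crawled target):
#   get_url_depth("https://example.com/")              -> 0
#   get_url_depth("https://example.com/company")       -> 1
#   get_url_depth("https://example.com/company/team")  -> 2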

def should_exclude_url(url: str) -> bool:
    """
    Decide whether a URL should be excluded:
    - resource files (images, icons, CSS, JS, fonts, etc.)
    - URLs containing a language path segment (e.g. /en/)
    - URLs using the plain http scheme
    """
    resource_extensions = {
        '.png', '.jpg', '.jpeg', '.gif', '.ico', '.svg',
        '.css', '.js', '.woff', '.woff2', '.ttf',
        '.webmanifest'
    }
    parsed = urlparse(url)
    path_parts = parsed.path.lower().split('/')

    # Resource file check
    is_resource = any(parsed.path.lower().endswith(ext) for ext in resource_extensions)

    # Language path check (does the path contain an /en/ segment?)
    has_language_path = any(part == 'en' for part in path_parts)

    # Plain-http scheme check
    is_http = parsed.scheme == 'http'

    return is_resource or has_language_path or is_http
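
# Illustrative examples (example.com is a placeholder):
#   should_exclude_url("https://example.com/assets/logo.png") -> True   (resource file)
#   should_exclude_url("https://example.com/en/about")        -> True   (language path)
#   should_exclude_url("http://example.com/recruit")          -> True   (http scheme)
#   should_exclude_url("https://example.com/recruit")         -> False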

def normalize_url(url: str) -> str:
    """
    Normalize a URL by:
    - Removing fragments and query params
    - Ensuring consistent trailing-slash handling
    - Lowercasing the host (netloc)
    """
    parsed = urlparse(url)
    # For the site root, keep a single trailing slash
    if not parsed.path or parsed.path == '/':
        path = '/'
    else:
        # For all other URLs, strip any trailing slash
        path = parsed.path.rstrip('/')

    return f"{parsed.scheme}://{parsed.netloc.lower()}{path}"


def get_dynamic_urls(url: str, driver: webdriver.Chrome) -> Set[str]:
    """Extract URLs using Selenium for dynamic content."""
    dynamic_urls = set()
    try:
        print(f"\nExtracting dynamic URLs from: {url}")
        driver.get(url)
        time.sleep(2)  # Wait for dynamic content
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        for link in soup.find_all(['a', 'link'], href=True):
            href = link['href']
            full_url = urljoin(url, href.split('#')[0])
            parsed = urlparse(full_url)
            if parsed.netloc == urlparse(url).netloc:
                dynamic_urls.add(full_url)
                print(f"Found dynamic URL: {full_url}")
    except Exception as e:
        print(f"Error in dynamic extraction: {e}")
    return dynamic_urls
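
# Hedged alternative to the fixed time.sleep(2) above: wait explicitly until at
# least one <a> element is present. This is a sketch, not wired into the crawl;
# it assumes the page renders anchor tags once its dynamic content has loaded,
# and the helper name wait_for_anchor_tags is illustrative only.
from selenium.webdriver.common.by import By  # local import, used only by this sketch

def wait_for_anchor_tags(driver: webdriver.Chrome, timeout: int = 10) -> None:
    """Block until at least one <a> element is present (illustrative sketch)."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.TAG_NAME, "a"))
    )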

def get_subdirectory_url_list(url: str, max_urls: int = 100) -> List[str]:
    """
    Extract URLs using both static and dynamic methods.
    Combines results, removes duplicates, and sorts by depth.

    Args:
        url (str): The base URL to start crawling from
        max_urls (int): Maximum number of URLs to collect (default: 100)

    Returns:
        List[str]: Combined list of URLs, sorted by depth (shallow first)
    """
    visited = set()
    result = []  # Store (depth, url) tuples

    def crawl_link(current_url: str, limit: int = max_urls):
        # If we've reached the URL limit, stop
        if len(result) >= limit:
            return
            
        # Normalize URL and check if visited
        normalized_url = normalize_url(current_url)
        if normalized_url not in visited:
            visited.add(normalized_url)
            depth = get_url_depth(normalized_url)
            result.append((depth, normalized_url))

        # Attempt to retrieve and parse subdirectory links
        try:
            resp = requests.get(current_url, timeout=10)
            if resp.status_code != 200:
                return

            soup = BeautifulSoup(resp.text, 'html.parser')
            base_domain = urlparse(url).netloc

            # For each link in the page
            for link_tag in soup.find_all('a', href=True):
                if len(result) >= limit:
                    break  # we already have enough URLs

                href = link_tag['href']
                if '#' in href or '?' in href:
                    continue
                    
                abs_url = urljoin(current_url, href)
                normalized_abs_url = normalize_url(abs_url)
                parsed = urlparse(normalized_abs_url)

                # Keep only URLs on the same domain as the base URL
                if parsed.netloc == base_domain:
                    if normalized_abs_url not in visited:
                        # Print for debugging
                        print(f"Found URL: {normalized_abs_url}")
                        crawl_link(normalized_abs_url, limit)

        except Exception:
            pass

    # Get static URLs through recursion
    print("\nExtracting static URLs...")
    crawl_link(url, max_urls)
    static_urls = {url_tuple[1] for url_tuple in result}
    print(f"Found {len(static_urls)} static URLs")
    
    # Setup Selenium for dynamic extraction
    print("\nSetting up dynamic extraction...")
    chrome_options = Options()
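    # --headless=new selects Chrome's newer headless mode; --no-sandbox and
    # --disable-dev-shm-usage are commonly needed when running in containers/CI.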
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    
    driver = None
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        # Get dynamic URLs
        print("\nExtracting dynamic URLs...")
        dynamic_urls = get_dynamic_urls(url, driver)
        print(f"Found {len(dynamic_urls)} dynamic URLs")
    finally:
        # Guard against the driver never being created (e.g. ChromeDriver install failure)
        if driver is not None:
            driver.quit()
    
    # Combine static and dynamic URLs
    all_urls = static_urls.union(dynamic_urls)
    print(f"\nTotal unique URLs found: {len(all_urls)}")
    
    # Deduplicate, normalize, and filter out resource files
    print("\nRemoving duplicates, normalizing URLs, and filtering resource files...")
    normalized_urls = set()

    for candidate_url in all_urls:
        # Skip URLs that should be excluded
        if should_exclude_url(candidate_url):
            continue
        # Normalize the URL (including trailing-slash handling)
        norm_url = normalize_url(candidate_url)
        normalized_urls.add(norm_url)
    
    print(f"Unique URLs after normalization: {len(normalized_urls)}")
    
    # Compute each URL's depth and sort (shallow first, then alphabetically), capped at max_urls
    url_depths = [(u, get_url_depth(u)) for u in normalized_urls]
    sorted_results = sorted(url_depths, key=lambda x: (x[1], x[0]))[:max_urls]
    
    # Keep only the URLs, dropping the depth values
    url_list = [u for u, _ in sorted_results]
    
    print("-" * 50)
    print("\nURLs only:")
    print(url_list)
    # Return URLs in depth order (shallowest first)
    return url_list
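
# Hedged sketch: crawl_link above recurses once per newly discovered URL, which can
# approach Python's default recursion limit on deeply linked sites. An iterative,
# breadth-first variant of the same static crawl is shown below for illustration;
# the name crawl_static_iterative and its structure are assumptions, not part of
# the original design, and it is not wired into the pipeline above.
from collections import deque  # local import, used only by this sketch

def crawl_static_iterative(start_url: str, max_urls: int = 100) -> List[str]:
    """Breadth-first, same-domain static crawl (illustrative sketch)."""
    base_domain = urlparse(start_url).netloc
    visited: Set[str] = set()
    queue = deque([normalize_url(start_url)])
    collected: List[str] = []
    while queue and len(collected) < max_urls:
        current = queue.popleft()
        if current in visited:
            continue
        visited.add(current)
        collected.append(current)
        try:
            resp = requests.get(current, timeout=10)
            if resp.status_code != 200:
                continue
            soup = BeautifulSoup(resp.text, 'html.parser')
            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                if '#' in href or '?' in href:
                    continue
                candidate = normalize_url(urljoin(current, href))
                if urlparse(candidate).netloc == base_domain and candidate not in visited:
                    queue.append(candidate)
        except requests.RequestException:
            continue
    # Shallow URLs first, matching the ordering used by get_subdirectory_url_list
    collected.sort(key=lambda u: (get_url_depth(u), u))
    return collected
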

if __name__ == "__main__":
    base_url = "https://newgrads.visional.inc/"
    if not base_url.startswith('https://'):
        raise ValueError("Base URL must use HTTPS scheme")
    print(f"\nExtracting URLs from: {base_url}")
    print("-" * 50)
    
    try:
        urls = get_subdirectory_url_list(base_url)
        print(f"\nSuccessfully extracted {len(urls)} URLs")
    except Exception as e:
        print(f"Error during URL extraction: {e}")
        raise
