import json
import re
from pathlib import Path
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# (category label, page URL) pairs to scrape. main() processes them in this
# order, and the label is stored on every question taken from that page.
CATEGORY_URLS = [
    ("core", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/core-questions"),
    ("parking", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/parking-questions"),
    ("parking-4", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/parking-questions-4"),
    ("parking-5", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/parking-questions-5"),
    ("parking-6", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/parking-questions-6"),
    ("emergency", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/emergency-questions"),
    ("signs-markings", "https://www.nzta.govt.nz/roadcode/theory-test-questions/general-questions/signs-and-markings-questions"),
]

# Where main() writes the generated JSON (relative to the working directory).
OUT_PATH = Path("web/question_bank.json")

# Shared HTTP session used by fetch_html(); the headers below are attached to
# every request made through it.
SESSION = requests.Session()
SESSION.headers.update(
    {
        "User-Agent": "Mozilla/5.0 (compatible; nz-quiz-bot/1.0)",
        "Accept-Language": "en-NZ,en;q=0.9",
    }
)

# Matches a question code -- one or more capital letters followed by digits,
# e.g. "C12" or "P3" -- at the start of the string (leading whitespace is
# allowed) and followed by at least one whitespace character.
# Group 1 captures the code itself.
CODE_RE = re.compile(r"^\s*([A-Z]+\d+)\s+")


def fetch_html(url: str) -> str:
    """Download *url* through the shared ``SESSION`` and return the body as text.

    The request uses a 30 second timeout, and ``raise_for_status()`` turns an
    HTTP error status into an exception instead of returning the error page.
    """
    response = SESSION.get(url, timeout=30)
    response.raise_for_status()
    return response.text


def clean_text(s: str) -> str:
    """Collapse every run of whitespace in *s* to one space and trim the ends.

    A falsy input (``None`` or the empty string) yields ``""``.
    """
    if not s:
        return ""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()


# Tags whose text may open with a question code.
_QUESTION_TAGS = ["p", "li", "div", "h3", "h4"]
# Tags that may hold a short continuation line of a question.
_CONTINUATION_TAGS = ["p", "div", "li", "span"]
# How many tags after the question element are inspected for continuations.
_CONTINUATION_WINDOW = 3


def _tag_text(tag) -> str:
    """Return the tag's text with whitespace collapsed to single spaces."""
    return clean_text(tag.get_text(" ", strip=True))


def _tags_after(el):
    """Yield ``(tag, inside)`` for every tag that follows *el* in document order.

    ``inside`` is True while the walk is still within *el*'s own subtree.
    """
    inside = True
    for node in el.next_elements:
        if getattr(node, "name", None) is None:
            continue  # skip non-tag nodes (plain strings, comments)
        if inside and not any(parent is el for parent in node.parents):
            # Descendants are visited first, so this flips exactly once.
            inside = False
        yield node, inside


def _continuation_lines(el):
    """Collect short follow-on lines (e.g. "_____ metres") placed after *el*."""
    lines = []
    inspected = 0
    for tag, inside in _tags_after(el):
        if inside:
            # Text of el's own children is already part of the question text;
            # counting them would duplicate it and waste the small window.
            continue
        if inspected == _CONTINUATION_WINDOW:
            break
        inspected += 1
        if tag.name in _CONTINUATION_TAGS:
            text = _tag_text(tag)
            if CODE_RE.match(text):
                break  # the next question has started
            lowered = text.lower()
            is_marker = "____" in text or "metre" in lowered or "km/h" in lowered
            # "text not in lines" stops a nested pair such as <div><p>..</p></div>
            # from contributing the same line twice.
            if 0 < len(text) <= 40 and is_marker and text not in lines:
                lines.append(text)
        elif tag.name == "hr":
            break
    return lines


def _image_after(el):
    """Return the first ``<img>`` with a ``src`` that belongs to *el*, or None.

    The search covers el's own subtree and the tags after it, but stops at the
    next element that opens with a question code so that a later question's
    image is never attached to this one.
    """
    for tag, inside in _tags_after(el):
        if tag.name == "img" and tag.get("src"):
            return tag
        if not inside and tag.name in _QUESTION_TAGS and CODE_RE.match(_tag_text(tag)):
            return None
    return None


def parse_questions(category: str, url: str):
    """Scrape one category page and return the questions found on it.

    Args:
        category: Label stored in each question's ``"category"`` field.
        url: Page to download; also used to resolve relative links and stored
            as ``"source_page"``.

    Returns:
        A list of dicts with the keys ``id``, ``category``, ``question``,
        ``reference_url``, ``image_url``, ``image_alt`` and ``source_page``,
        in document order, at most one per question code.

    Raises:
        Whatever ``fetch_html`` raises when the download fails.
    """
    soup = BeautifulSoup(fetch_html(url), "lxml")
    # Restrict the search to <main> when the page has one.
    root = soup.find("main") or soup

    questions = []
    seen_codes = set()

    # NOTE(review): a wrapper element (e.g. a <div> around a whole question)
    # whose text opens with the code is visited before its inner element and
    # contributes all of its text -- confirm against the live markup.
    for el in root.find_all(_QUESTION_TAGS):
        text = _tag_text(el)
        m = CODE_RE.match(text)
        if not m:
            continue

        qcode = m.group(1)
        if qcode in seen_codes:
            continue

        # First link after the question element (the "find out" link).
        # href=True skips bare named anchors that carry no target.
        link = el.find_next("a", href=True)
        href = link.get("href") if link else None
        if not href:
            # Without a reference link the match is not treated as a question.
            continue

        continuation = _continuation_lines(el)
        qtext = clean_text(" ".join([text, *continuation]))

        image_url = None
        image_alt = None
        img = _image_after(el)
        if img is not None:
            image_url = urljoin(url, img["src"])
            image_alt = clean_text(img.get("alt", "")) or None

        seen_codes.add(qcode)
        questions.append(
            {
                "id": qcode,
                "category": category,
                "question": qtext,
                "reference_url": urljoin(url, href),
                "image_url": image_url,
                "image_alt": image_alt,
                "source_page": url,
            }
        )

    return questions


def main():
    """Scrape every configured category and write the merged question bank.

    A category that fails is reported on stdout and skipped. Questions are
    de-duplicated by ``id`` (the first occurrence wins) before the result is
    written to ``OUT_PATH`` as UTF-8 JSON.
    """
    scraped = []
    for category, url in CATEGORY_URLS:
        try:
            qs = parse_questions(category, url)
            print(f"{category}: {len(qs)} questions")
            scraped += qs
        except Exception as e:
            # Best effort: one broken page must not abort the whole run.
            print(f"ERROR scraping {category} {url}: {e}")

    # Keep only the first question seen for each id, preserving order.
    by_id = {}
    for question in scraped:
        by_id.setdefault(question["id"], question)

    final = {
        "generated_from": [page_url for _label, page_url in CATEGORY_URLS],
        "count": len(by_id),
        "questions": list(by_id.values()),
    }

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(final, indent=2, ensure_ascii=False)
    OUT_PATH.write_text(payload, encoding="utf-8")
    print(f"\nWrote {OUT_PATH} with {final['count']} questions")


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
