# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import xml.etree.ElementTree as ET
import re
import os
import threading
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# Serializes worker-thread appends to OUTPUT_FILE / CHECKPOINT_FILE and
# updates to the shared progress counter.
lock = threading.Lock()
# Newline-delimited JSON: one extracted job record per line.
OUTPUT_FILE = "jobs.ndjson"
# One successfully processed URL per line; read back on startup to resume.
CHECKPOINT_FILE = "checkpoint.txt"

# Lower-case substrings that mark a response body as an anti-bot / denial page.
_BLOCKED_INDICATORS = (
    "cloudflare",
    "verify_checkpoint",
    "just a moment",
    "enable cookies",
    "403 forbidden",
)


def is_blocked(text):
    """Return True when *text* looks like an anti-bot or access-denied page.

    The check is a case-insensitive substring search for any of the known
    indicator phrases.

    Args:
        text: Response body to inspect. ``None`` or an empty string is
            treated as "not blocked".

    Returns:
        bool: True if any indicator phrase occurs in *text*.
    """
    if not text:
        return False
    # Lower-case once instead of once per indicator.
    lowered = text.lower()
    return any(indicator in lowered for indicator in _BLOCKED_INDICATORS)

def extract_urls_from_sitemap(xml_content):
    """Collect the page URLs listed in a sitemap ``<urlset>`` document.

    Only ``<url>`` elements that are direct children of the root and live in
    the standard sitemap 0.9 namespace are considered; each contributes the
    stripped text of its ``<loc>`` child.

    Args:
        xml_content: Sitemap XML as ``str`` or ``bytes``.

    Returns:
        list[str]: The URLs in document order. If parsing fails, the error is
        printed and whatever was collected so far (normally nothing) is
        returned.
    """
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    found = []
    try:
        tree_root = ET.fromstring(xml_content)
        for entry in tree_root.findall("sm:url", ns):
            loc = entry.find("sm:loc", ns)
            # Skip entries with no <loc> or an empty one.
            if loc is None or not loc.text:
                continue
            found.append(loc.text.strip())
    except Exception as e:
        print(f"Error parsing XML sitemap: {e}")
    return found

def extract_salary(base_salary):
    """Format a schema.org ``baseSalary`` value as a short display string.

    Args:
        base_salary: The ``baseSalary`` entry of a JobPosting. Expected to be
            a dict whose ``"value"`` is either a nested dict (with
            ``minValue``/``maxValue`` or ``value``, plus optional
            ``unitText``) or a bare scalar. Anything else yields ``""``.

    Returns:
        str: ``"$min-$max unit"``, ``"$value unit"``, ``"$value"``, or ``""``
        when no usable amount is present. The unit is lower-cased and omitted
        when missing.
    """
    salary = ""
    if isinstance(base_salary, dict):
        value = base_salary.get("value")
        if isinstance(value, dict):
            min_val, max_val = value.get("minValue"), value.get("maxValue")
            single_val = value.get("value")
            # unitText may be present but null (or not a string); the default
            # passed to .get() does not cover that, so guard before .lower().
            unit_text = value.get("unitText")
            unit = unit_text.lower() if isinstance(unit_text, str) else ""
            if min_val and max_val:
                salary = f"${min_val}-${max_val} {unit}"
            elif single_val:
                salary = f"${single_val} {unit}"
        elif value:
            salary = f"${value}"
    # Drops the trailing space left behind when there is no unit.
    return salary.strip()

def extract_fields(job_posting):
    """Flatten a schema.org JobPosting dict into the record written to disk.

    Args:
        job_posting: Decoded JSON-LD object whose ``@type`` is ``JobPosting``.

    Returns:
        dict with keys:
            ``job_title``   - ``title``, else ``jobTitle``, else "Remote Job".
            ``body_parts``  - single-element list holding ``description``.
            ``location``    - "locality, region, country" built from the first
                              ``jobLocation`` address, or "Remote" when no
                              usable address part is present.
            ``job_details`` - formatted salary (if any) followed by the
                              title-cased employment type (if any).
    """
    # Fall through to the default even when jobTitle is present but empty/null.
    job_title = job_posting.get("title") or job_posting.get("jobTitle") or "Remote Job"
    description = job_posting.get("description", "")

    location = "Remote"
    job_loc = job_posting.get("jobLocation")
    if job_loc:
        # Only the first location is used when several are listed.
        loc_item = job_loc[0] if isinstance(job_loc, list) else job_loc
        if isinstance(loc_item, dict):
            addr = loc_item.get("address", {})
            if isinstance(addr, dict):
                parts = []
                for key in ("addressLocality", "addressRegion", "addressCountry"):
                    part = addr.get(key)
                    # addressCountry (and occasionally others) may be a nested
                    # object such as {"@type": "Country", "name": "US"}.
                    if isinstance(part, dict):
                        part = part.get("name")
                    # A JSON null must not become the literal string "None".
                    if part is None:
                        continue
                    text = str(part).strip()
                    if text:
                        parts.append(text)
                # Keep the "Remote" default when the address has no content.
                if parts:
                    location = ", ".join(parts)

    salary = extract_salary(job_posting.get("baseSalary"))
    details = [salary] if salary else []
    emp = job_posting.get("employmentType", "")
    if emp:
        # Only the first employment type is reported when a list is given.
        details.append(str(emp[0] if isinstance(emp, list) else emp).title())

    return {
        "job_title": job_title,
        "body_parts": [description],
        "location": location,
        "job_details": details,
    }

def _iter_job_postings(data):
    """Yield every JobPosting dict found in a decoded JSON-LD payload."""
    nodes = data if isinstance(data, list) else [data]
    for node in nodes:
        # JSON-LD arrays may contain strings or other non-object values.
        if not isinstance(node, dict):
            continue
        node_type = node.get("@type")
        types = node_type if isinstance(node_type, list) else [node_type]
        if "JobPosting" in types:
            yield node
        # Many sites wrap their entities in a top-level "@graph" array.
        graph = node.get("@graph")
        if isinstance(graph, list):
            yield from _iter_job_postings(graph)


def process_and_checkpoint(url, counters, total):
    """Fetch one job page, store its JobPosting records and checkpoint the URL.

    After a random 1-3 s delay the page is downloaded with a randomly chosen
    browser User-Agent. On an unblocked HTTP 200 every JobPosting found in the
    page's ``application/ld+json`` scripts is appended to ``OUTPUT_FILE`` as
    one JSON line, the URL is appended to ``CHECKPOINT_FILE`` and
    ``counters["done"]`` is incremented. Blocked pages, other HTTP statuses
    and exceptions are reported on stdout and leave the URL un-checkpointed,
    so it is retried on the next run.

    Args:
        url: Page URL to fetch.
        counters: Shared dict; its ``"done"`` entry is incremented under
            ``lock`` on success.
        total: Number of URLs in this run; used only in the progress message.

    Returns:
        None. All exceptions are caught and reported, never propagated.
    """
    # Fresh headers for every request to look like a new visitor
    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
        ]),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Referer": "https://www.google.com/"
    }
    try:
        time.sleep(random.uniform(1, 3))
        # FAST TIMEOUT (10s) to prevent hanging
        r = requests.get(url, headers=headers, timeout=10)
        # Evaluate once; both branches below need the answer.
        blocked = is_blocked(r.text)

        if r.status_code == 200 and not blocked:
            soup = BeautifulSoup(r.text, 'html.parser')
            for script in soup.find_all("script", type="application/ld+json"):
                raw = script.string
                if not raw:
                    continue
                try:
                    data = json.loads(raw)
                except ValueError:
                    # Malformed JSON-LD is common; skip this script only.
                    continue
                for posting in _iter_job_postings(data):
                    try:
                        line = json.dumps(extract_fields(posting)) + "\n"
                    except Exception as exc:
                        # Best effort: one odd posting must not lose the rest.
                        print(f"⚠️ Could not extract a posting from {url}: {exc}")
                        continue
                    with lock:
                        with open(OUTPUT_FILE, "a", encoding="utf-8") as out:
                            out.write(line)

            with lock:
                with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
                    f.write(url + "\n")
                counters["done"] += 1
                print(f"✅ [{counters['done']}/{total}] Done: {url}")
        elif r.status_code == 403 or blocked:
            print(f"🛑 BLOCKED on {url}. Changing IP recommended.")
        else:
            # Previously these responses vanished without any trace.
            print(f"⚠️ Skip {url}: HTTP {r.status_code}")
    except Exception:
        print(f"⚠️ Skip {url}: Timeout/Error")

def main():
    """Run the scraper: load sitemaps, resume from the checkpoint, fetch jobs.

    Reads sitemap URLs (one per line) from ``sitemaps.txt`` in the current
    directory, downloads each sitemap and collects the page URLs it lists,
    drops URLs already recorded in ``CHECKPOINT_FILE``, and processes the
    remainder with three worker threads.

    Returns:
        None. Returns early, with a message, when ``sitemaps.txt`` is missing.
    """
    if not os.path.exists("sitemaps.txt"):
        print("sitemaps.txt not found; nothing to do.")
        return
    with open("sitemaps.txt", "r") as f:
        s_urls = [l.strip() for l in f if l.strip()]

    all_urls = []
    for s in s_urls:
        try:
            r = requests.get(s, timeout=10)
            all_urls.extend(extract_urls_from_sitemap(r.content))
        except Exception as exc:
            # Best effort per sitemap, but do not swallow Ctrl-C / SystemExit
            # the way a bare ``except:`` would.
            print(f"⚠️ Could not fetch sitemap {s}: {exc}")
            continue

    # De-duplicate and sort so runs are reproducible.
    all_urls = sorted(set(all_urls))
    done = set()
    if os.path.exists(CHECKPOINT_FILE):
        # The checkpoint is written as UTF-8, so read it back the same way.
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            done = {l.strip() for l in f}

    todo = [u for u in all_urls if u not in done]
    counters = {"done": 0}

    print(f"🚀 Starting! {len(todo)} jobs to go.")
    # USE 3 WORKERS (Safe balance)
    with ThreadPoolExecutor(max_workers=3) as exe:
        # list() drains the iterator so every task runs to completion here.
        list(exe.map(lambda u: process_and_checkpoint(u, counters, len(todo)), todo))

if __name__ == "__main__":
    main()