pgz-sport/scrapers/sport_pgz_full.py_prije_env_deepseek

#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: sport_pgz_full.py | v1.0.0 | 05.05.2026
# Lokacija: /opt/pgz-sport/scrapers/sport_pgz_full.py
# Svrha: Sitemap-driven full crawl of sport-pgz.hr
#   - All 4 sitemaps: objave (1+2), natječaji, stranice
#   - PDF download + OCR ingest
#   - Article parsing → dabi.knowledge ingest
# ═══════════════════════════════════════════════════════════════════
"""sport-pgz.hr complete corpus via sitemap."""
import os, sys, re, time, hashlib, logging, json
from urllib.parse import urljoin, urlparse
import urllib.request
from html import unescape
import psycopg2
from psycopg2.extras import execute_batch

# Configure root logging once at import: INFO level, timestamped lines
# tagged with "[sport_pgz]".
logging.basicConfig(level=logging.INFO, format="%(asctime)s [sport_pgz] %(message)s")
log = logging.getLogger("sport_pgz")

# PostgreSQL connection string. The password is read from the DB_PASSWORD
# environment variable via os.environ[...], so importing this module raises
# KeyError when that variable is not set.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# User-Agent header value sent with every HTTP request.
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"
# Sitemap index that the crawl starts from.
SITEMAP_INDEX = "https://sport-pgz.hr/sitemap.xml"
# Local directories where downloaded PDFs and raw HTML pages are stored.
PDF_DIR = "/opt/pgz-sport/data/sport_pgz_pdf"
HTML_DIR = "/opt/pgz-sport/data/sport_pgz_html"

# Create both output directories at import time; exist_ok avoids an error
# when they are already present.
os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(HTML_DIR, exist_ok=True)


def fetch(url, timeout=20, retries=3, binary=False):
    """Download *url*, retrying transient failures with a linear back-off.

    Args:
        url: Absolute URL to request.
        timeout: Socket timeout in seconds handed to ``urlopen``.
        retries: Maximum number of attempts.
        binary: When true, return the raw response bytes; otherwise decode
            the body as UTF-8, replacing undecodable bytes.

    Returns:
        ``(body, status)`` on success, or ``(None, 0)`` when no attempt
        succeeded. Errors are logged, never raised.
    """
    import urllib.error  # local import keeps this block self-contained

    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=timeout) as r:
                data = r.read()
                if binary:
                    return data, r.status
                return data.decode("utf-8", errors="replace"), r.status
        except Exception as e:
            log.warning(f"fetch fail attempt {i+1} {url}: {e}")
            # A 4xx answer is definitive (missing page, forbidden, ...), so
            # retrying only wastes time; 408 and 429 are the transient
            # exceptions worth another attempt.
            if (
                isinstance(e, urllib.error.HTTPError)
                and 400 <= e.code < 500
                and e.code not in (408, 429)
            ):
                break
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays the (None, 0) result.
            if i + 1 < retries:
                time.sleep(3 * (i + 1))
    return None, 0


def parse_sitemap_index(xml):
    """Return the list of http(s) URLs found in ``<loc>`` elements.

    Whitespace around the URL inside ``<loc>`` is ignored and XML character
    references (e.g. ``&amp;``) are decoded, because sitemap files must
    entity-escape their URLs and the escaped form is not fetchable as-is.

    Args:
        xml: Sitemap index document as text; ``None`` or empty is allowed.

    Returns:
        List of URL strings in document order (empty when nothing matches).
    """
    found = re.findall(r"<loc>\s*(https?://[^<]+?)\s*</loc>", xml or "")
    return [unescape(raw).strip() for raw in found]


def parse_sitemap_urls(xml):
    """Return list of (url, lastmod) pairs from a ``<urlset>`` sitemap.

    Each ``<url>...</url>`` element is inspected on its own, so ``<lastmod>``
    is found even when other children (``<changefreq>``, image entries, ...)
    sit between it and ``<loc>``. URLs are stripped of surrounding whitespace
    and XML character references such as ``&amp;`` are decoded.

    Args:
        xml: Sitemap document as text; ``None`` or empty is allowed.

    Returns:
        List of ``(url, lastmod)`` tuples in document order; ``lastmod`` is
        ``""`` when the entry has none.
    """
    out = []
    # "\b" keeps "<urlset ...>" from being mistaken for a "<url>" element.
    for entry in re.findall(r"<url\b[^>]*>(.*?)</url>", xml or "", flags=re.S):
        loc = re.search(r"<loc>\s*([^<]+?)\s*</loc>", entry)
        if not loc:
            continue
        url = unescape(loc.group(1)).strip()
        if not url:
            continue
        lastmod = re.search(r"<lastmod>\s*([^<]*?)\s*</lastmod>", entry)
        out.append((url, lastmod.group(1) if lastmod else ""))
    return out


def extract_main_text(html):
    """Reduce an HTML document to a single line of visible text.

    The ``script``, ``style``, ``nav`` and ``footer`` elements are removed
    together with their content, every remaining tag is replaced by a space,
    HTML entities are decoded and whitespace runs collapse to one space.

    Returns ``""`` for empty or ``None`` input.
    """
    if not html:
        return ""

    remaining = html
    # Drop whole elements whose content is not article text.
    for tag in ("script", "style", "nav", "footer"):
        remaining = re.sub(
            rf"<{tag}[^>]*>.*?</{tag}>", "", remaining, flags=re.S | re.I
        )

    # Replace any leftover tag with a space so adjacent words stay apart.
    plain = unescape(re.sub(r"<[^>]+>", " ", remaining))
    collapsed = re.sub(r"\s+", " ", plain)
    return collapsed.strip()


def extract_title(html):
    """Return the text of the first ``<title>`` element, or ``""``.

    The opening tag may carry attributes (e.g. ``<title lang="hr">``).
    Entities are decoded and whitespace runs collapse to single spaces.

    Args:
        html: Page markup as text; ``None`` or empty is allowed.
    """
    # "\b[^>]*" accepts attributes on the tag without matching other
    # element names that merely start with "title".
    m = re.search(r"<title\b[^>]*>([^<]+)</title>", html or "", re.I)
    if m:
        return re.sub(r"\s+", " ", unescape(m.group(1))).strip()
    return ""


def find_pdf_links(html, base):
    """Return the absolute URLs of all PDF links found in *html*.

    A link counts as a PDF when its path ends in ``.pdf`` (case-insensitive),
    optionally followed by a query string or fragment. HTML entities in the
    attribute value are decoded, the fragment is dropped (it never reaches
    the server) and relative links are resolved against *base*.

    Args:
        html: Page markup as text; ``None`` or empty is allowed.
        base: URL of the page, used to resolve relative links.

    Returns:
        Sorted list of unique absolute URLs, so results are deterministic.
    """
    if not html:
        return []
    links = set()
    pattern = r'href=["\']([^"\']+\.pdf(?:[?#][^"\']*)?)["\']'
    for m in re.finditer(pattern, html, re.I):
        # Decode entities first: "&#038;" contains "#" and must not be
        # mistaken for the start of a fragment.
        href = unescape(m.group(1)).split("#", 1)[0]
        links.add(urljoin(base, href))
    return sorted(links)


def chunk_text(text, max_len=800):
    """Split *text* into chunks of at most *max_len* characters.

    A chunk is cut at the last sentence boundary (". ", "! ", "? " or a
    newline, tried in that order) that lies in the second half of the
    window; otherwise it is cut hard at *max_len*. When splitting occurs,
    chunks of 50 characters or fewer are discarded. Text that already fits
    in one chunk is returned unchanged, whatever its length.

    Args:
        text: Text to split; ``None`` or empty yields ``[]``.
        max_len: Maximum chunk length in characters; must be positive.

    Returns:
        List of chunk strings.

    Raises:
        ValueError: If *max_len* is not positive while *text* is non-empty
            (the split loop could never advance).
    """
    if not text:
        return []
    if max_len <= 0:
        raise ValueError("max_len must be positive")
    if len(text) <= max_len:
        return [text]
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + max_len, len(text))
        # Only look for a sentence boundary when this is not the final chunk.
        if end < len(text):
            for sep in [". ", "! ", "? ", "\n"]:
                p = text.rfind(sep, start, end)
                # Accept the boundary only past the window's midpoint so
                # chunks do not become needlessly short.
                if p > start + max_len // 2:
                    end = p + len(sep)
                    break
        chunks.append(text[start:end].strip())
        start = end
    return [c for c in chunks if len(c) > 50]


def upsert_facts(conn, facts):
    """Bulk insert facts into dabi.knowledge, skipping known ones.

    Rows are keyed by ``data_hash`` (MD5 of the fact text); a row whose hash
    already exists is left untouched via ``ON CONFLICT DO NOTHING``.

    Args:
        conn: Open psycopg2 connection.
        facts: Iterable of dicts with required keys ``fact`` and ``source``
            and optional ``category``, ``confidence`` and ``url``.

    Returns:
        Number of rows actually inserted; ``0`` for empty input or when the
        database call fails (the error is logged, not raised).
    """
    if not facts:
        return 0
    # Local import: execute_values can return RETURNING rows, which gives a
    # correct insert count. After execute_batch, cur.rowcount reflects only
    # the last statement sent, not the whole batch.
    from psycopg2.extras import execute_values

    rows = []
    for f in facts:
        h = hashlib.md5(f["fact"].encode()).hexdigest()
        rows.append((
            f["fact"], f["source"], f.get("category", "sport_pgz"),
            f.get("confidence", 0.85), h,
            json.dumps({"url": f.get("url", "")}),
        ))
    sql = """
        INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs)
        VALUES %s
        ON CONFLICT (data_hash) DO NOTHING
        RETURNING 1
    """
    try:
        # The context manager closes the cursor on the error path as well.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur, sql, rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50, fetch=True,
            )
        return len(inserted)
    except Exception as e:
        log.error(f"upsert err: {e}")
        return 0


def _collect_sitemap_urls():
    """Fetch the sitemap index and all sub-sitemaps; return (url, lastmod) pairs or None if the index fetch fails."""
    xml, _ = fetch(SITEMAP_INDEX)
    if not xml:
        log.error("sitemap fetch failed")
        return None

    sub_sitemaps = parse_sitemap_index(xml)
    log.info(f"sub-sitemaps: {len(sub_sitemaps)}")

    all_urls = []
    for sm in sub_sitemaps:
        sub_xml, _ = fetch(sm)
        if sub_xml:
            urls = parse_sitemap_urls(sub_xml)
            all_urls.extend(urls)
            log.info(f"  {sm}: {len(urls)} urls")

    log.info(f"TOTAL URLs to crawl: {len(all_urls)}")
    return all_urls


def _save_html_snapshot(url, html):
    """Best-effort write of the raw page to HTML_DIR so a crawl can be replayed offline."""
    url_hash = hashlib.md5(url.encode()).hexdigest()[:16]
    html_path = f"{HTML_DIR}/{url_hash}.html"
    try:
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html)
    except Exception as e:
        # A failed snapshot must not stop ingestion, but it should be visible.
        log.warning(f"html save failed {html_path}: {e}")


def _build_page_facts(url, title, text):
    """Turn a page's title and body text into fact dicts ready for upsert_facts."""
    facts = []
    if title and len(title) > 10:
        facts.append({
            "fact": f"sport-pgz.hr — {title}",
            "source": "sport-pgz.hr",
            "category": "sport_pgz_official",
            "confidence": 0.92,
            "url": url,
        })

    for chunk in chunk_text(text, max_len=800):
        if len(chunk) < 80:
            continue
        facts.append({
            "fact": chunk,
            "source": "sport-pgz.hr",
            "category": "sport_pgz_official",
            "confidence": 0.88,
            "url": url,
        })
    return facts


def _download_pdfs(pdf_urls):
    """Download not-yet-stored PDFs among the first 200 URLs into PDF_DIR; return the number saved."""
    pdf_downloaded = 0
    for pdf_url in pdf_urls[:200]:  # limit for first run
        try:
            url_hash = hashlib.md5(pdf_url.encode()).hexdigest()[:16]
            pdf_path = f"{PDF_DIR}/{url_hash}.pdf"
            if os.path.exists(pdf_path):
                continue
            data, status = fetch(pdf_url, timeout=30, binary=True)
            if data and status == 200:
                with open(pdf_path, "wb") as f:
                    f.write(data)
                pdf_downloaded += 1
                time.sleep(1)
        except Exception as e:
            log.warning(f"pdf err {pdf_url}: {e}")
    return pdf_downloaded


def crawl():
    """Main crawl entry.

    Reads the sitemap index, fetches every listed page, stores its HTML,
    ingests title and text chunks as facts, and finally downloads the PDFs
    linked from those pages.

    Returns:
        Dict with the counters ``crawled``, ``facts`` and ``pdfs``, or
        ``None`` when the sitemap index could not be fetched.
    """
    log.info("=== sport-pgz.hr full crawl ===")
    conn = psycopg2.connect(DSN)
    # try/finally guarantees the connection is released on the early return
    # and on unexpected errors as well, not only on the success path.
    try:
        conn.autocommit = True

        # 1. Get sitemap index and expand it into page URLs
        all_urls = _collect_sitemap_urls()
        if all_urls is None:
            return None

        # 2. Crawl each URL → text → facts
        total_facts = 0
        crawled = 0
        pdfs_found = set()

        for idx, (url, _lastmod) in enumerate(all_urls, 1):
            if idx % 20 == 0:
                log.info(f"  progress: {idx}/{len(all_urls)} crawled, {total_facts} facts")

            try:
                html, _status = fetch(url, timeout=15)
                if not html:
                    continue

                _save_html_snapshot(url, html)

                title = extract_title(html)
                text = extract_main_text(html)

                # Collect PDFs; the set removes links repeated across pages.
                pdfs_found.update(find_pdf_links(html, url))

                facts = _build_page_facts(url, title, text)
                total_facts += upsert_facts(conn, facts)
                crawled += 1

                time.sleep(0.5)  # rate limit

            except Exception as e:
                # One bad page must not abort the whole crawl.
                log.warning(f"err {url}: {e}")

        # 3. Download PDFs
        pdfs_set = list(pdfs_found)
        log.info(f"PDF links found: {len(pdfs_set)}")
        pdf_downloaded = _download_pdfs(pdfs_set)

        log.info(f"=== DONE: {crawled} pages crawled, {total_facts} facts inserted, {pdf_downloaded} PDFs downloaded ===")
        return {"crawled": crawled, "facts": total_facts, "pdfs": pdf_downloaded}
    finally:
        conn.close()


if __name__ == "__main__":
    # Script entry point: run the full crawl and print its summary as JSON
    # on stdout.
    print(json.dumps(crawl()))