pgz-sport/scrapers/sport_federations_deep.py_prije_env_deepseek

#!/usr/bin/env python3
import os
# Federation deep scrape — HNS, HPS, HRS, HOK, HKS
import sys
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from gov_hr_sport_scraper import fetch, extract_text, find_links
from urllib.parse import urljoin, urlparse
import time, re, hashlib, json, psycopg2
from html import unescape
import logging

# Log at INFO level; every line carries the "[fed]" tag so output from this
# scraper can be told apart from other scrapers' logs.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
log = logging.getLogger("fed")


def _conninfo_quote(value):
    """Return *value* quoted for use in a libpq keyword/value connection string.

    libpq requires values that are empty or contain spaces to be wrapped in
    single quotes, with embedded single quotes and backslashes escaped by a
    backslash. Quoting unconditionally is always valid.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Connection string handed to psycopg2.connect(). The password is read from
# the DB_PASSWORD environment variable; if it is unset, a KeyError is raised
# at import time (fail fast rather than connecting without credentials).
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_conninfo_quote(os.environ['DB_PASSWORD'])}"
)

# Crawl entry points, keyed by federation code. harvest() walks each list
# breadth-first and uses the key both as the stored organisation name and,
# lower-cased, in the fact category ("fed_<code>").
ROOTS = {
    "HNS": [
        "https://hns-cff.hr",
        "https://www.nspgz.hr",
    ],
    "HPS": [
        "https://www.hps.hr",
    ],
    "HRS": [
        "https://www.hrs.hr",
    ],
    # Croatian Olympic Committee (Hrvatski olimpijski komitet)
    "HOK": [
        "https://www.hok.hr",
    ],
    # NOTE(review): the original note read "Hrvatski karatraski savez" —
    # presumably the karate federation; confirm the intended organisation.
    "HKS": [
        "https://www.hks.hr",
    ],
}

# Patterns and fixed limits shared by the crawl helpers below.
_TITLE_RE = re.compile(r'<title[^>]*>([^<]+)</title>', re.I)
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)
_SPORT_KEYWORDS = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')
_MIN_TEXT_LEN = 200     # pages / chunks with less extracted text are skipped
_FACT_CHUNK_LEN = 800   # characters per knowledge chunk
_FACT_SCAN_LEN = 3000   # only the start of the page text is chunked
_MAX_FACT_CHUNKS = 3    # at most this many chunks are stored per page


def _store_document(cur, fed, url, html, text):
    """Insert one page into pgz_sport.dokumenti; return the rows inserted."""
    title_m = _TITLE_RE.search(html)
    # The title is cut out of raw HTML, so character references such as
    # "&amp;" or "&#8211;" have to be decoded before it is stored.
    title = unescape(title_m.group(1)).strip() if title_m else url[:80]
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    try:
        cur.execute("""INSERT INTO pgz_sport.dokumenti
            (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
            VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
            (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, fed))
    except (psycopg2.Error, ValueError) as exc:
        # Best effort: one bad row must not stop the crawl, but it must be
        # visible. psycopg2 raises ValueError (not psycopg2.Error) for
        # strings containing NUL characters.
        log.warning("  dokumenti insert failed for %s: %s", url[:80], exc)
        return 0
    return cur.rowcount


def _store_facts(cur, fed, url, text):
    """Insert up to three leading text chunks into dabi.knowledge; return rows inserted."""
    lowered = text.lower()
    if not any(kw in lowered for kw in _SPORT_KEYWORDS):
        return 0

    category = f'fed_{fed.lower()}'
    source_refs = json.dumps([{"url": url}])
    scan_len = min(len(text), _FACT_SCAN_LEN)
    chunks = [text[i:i + _FACT_CHUNK_LEN] for i in range(0, scan_len, _FACT_CHUNK_LEN)]

    inserted = 0
    for ci, chunk in enumerate(chunks[:_MAX_FACT_CHUNKS]):
        if len(chunk) < _MIN_TEXT_LEN:
            continue
        # Dedup key: page URL + chunk index + the chunk's first 80 characters.
        fh = hashlib.sha256((url + str(ci) + chunk[:80]).encode()).hexdigest()[:32]
        try:
            cur.execute("""INSERT INTO dabi.knowledge
                (fact, category, source, source_refs, confidence, data_hash, created_at)
                VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                ON CONFLICT (data_hash) DO NOTHING""",
                (chunk, category, source_refs, fh))
        except (psycopg2.Error, ValueError) as exc:
            log.warning("  knowledge insert failed for %s chunk %s: %s", url[:80], ci, exc)
            continue
        inserted += cur.rowcount
    return inserted


def _enqueue_links(page_url, html, queue, seen, max_queue):
    """Append unseen same-host links found in *html* to *queue*."""
    host = urlparse(page_url).hostname
    for m in _HREF_RE.finditer(html):
        try:
            # Attribute values in raw HTML are entity-encoded ("&amp;"), and
            # a "#fragment" never reaches the server, so decode the former
            # and drop the latter to avoid fetching the same page twice.
            link = urljoin(page_url, unescape(m.group(1))).partition('#')[0]
            link_host = urlparse(link).hostname
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip this link only.
            continue
        if link_host != host or link in seen:
            continue
        seen.add(link)
        queue.append(link)
        if len(queue) > max_queue:
            break


def _crawl_federation(cur, fed, roots, max_pages, max_queue, delay):
    """Breadth-first crawl of one federation; return (docs_inserted, facts_inserted)."""
    from collections import deque  # local import keeps this block self-contained

    log.info("=== %s deep ===", fed)
    queue = deque(roots)
    seen = set(roots)   # every URL ever queued (visited URLs are a subset)
    visited = set()     # URLs actually popped; counted against max_pages
    docs = facts = 0

    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        time.sleep(delay)  # be polite to the remote site
        html, status = fetch(url)
        if not html or status != 200:
            continue
        log.info("  [%s] %s", status, url[:80])
        text = extract_text(html)
        # Pages with too little text are neither stored nor mined for links.
        if len(text) < _MIN_TEXT_LEN:
            continue

        docs += _store_document(cur, fed, url, html, text)
        facts += _store_facts(cur, fed, url, text)
        _enqueue_links(url, html, queue, seen, max_queue)

    return docs, facts


def harvest(*, max_pages=80, max_queue=100, delay=2):
    """Crawl every federation in ROOTS and store pages and facts in Postgres.

    For each federation the root URLs are crawled breadth-first, staying on
    the host of the page being processed. Each page with enough extracted
    text is inserted into ``pgz_sport.dokumenti``; if the text mentions a
    sport-related keyword, up to three leading 800-character chunks are also
    inserted into ``dabi.knowledge``. Inserts are best effort: a failed
    insert is logged and the crawl continues.

    Args:
        max_pages: Maximum number of URLs visited per federation.
        max_queue: Link collection from a page stops once the pending queue
            grows beyond this many URLs.
        delay: Seconds to sleep before each fetch.

    Raises:
        psycopg2.Error: If the database connection cannot be opened.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: each insert stands alone, so a failed statement does
        # not leave the connection in an aborted transaction.
        conn.autocommit = True
        total_docs = total_facts = 0
        with conn.cursor() as cur:
            for fed, roots in ROOTS.items():
                docs, facts = _crawl_federation(cur, fed, roots, max_pages, max_queue, delay)
                total_docs += docs
                total_facts += facts
        log.info("TOTAL: docs=%s facts=%s", total_docs, total_facts)
    finally:
        # Always release the connection, even if a fetch or parse step raised.
        conn.close()

# Script entry point: run the full crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    harvest()