Files
pgz-sport/scrapers/sport_federations_deep.py_prije_env_deepseek

82 lines
3.5 KiB
Python

#!/usr/bin/env python3
import os
# Federation deep scrape — crawls the sites of HNS, HPS, HRS, HOK and HKS (see ROOTS)
import sys
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from gov_hr_sport_scraper import fetch, extract_text, find_links
from urllib.parse import urljoin, urlparse
import time, re, hashlib, json, psycopg2
from html import unescape
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
log = logging.getLogger("fed")


def _dsn_quote(value):
    """Return *value* quoted for use in a libpq keyword/value connection string.

    libpq requires values containing whitespace (or empty values) to be
    wrapped in single quotes, with embedded single quotes and backslashes
    escaped by a backslash. Quoting unconditionally is always valid.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# libpq connection string used by harvest(). The password comes from the
# DB_PASSWORD environment variable; a missing variable raises KeyError at
# import time, so the script fails before any crawling starts.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_dsn_quote(os.environ['DB_PASSWORD'])}"
)
# Crawl entry points: organisation code -> list of start URLs.
ROOTS = {
    "HNS": [
        "https://hns-cff.hr",
        "https://www.nspgz.hr",
    ],
    "HPS": ["https://www.hps.hr"],
    "HRS": ["https://www.hrs.hr"],
    # Croatian Olympic Committee
    "HOK": ["https://www.hok.hr"],
    # NOTE(review): the original comment read "Hrvatski karatraski savez"
    # (presumably the karate federation) — confirm which body owns hks.hr.
    "HKS": ["https://www.hks.hr"],
}
# Patterns are compiled once instead of on every crawled page.
_TITLE_RE = re.compile(r'<title[^>]*>([^<]+)</title>', re.I)
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)

# Substrings (matched against lower-cased page text) that mark a page as
# sport-relevant enough to also be stored as knowledge facts.
_SPORT_KEYWORDS = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')

_MIN_TEXT_LEN = 200      # pages / chunks shorter than this are ignored
_CHUNK_SIZE = 800        # characters per knowledge fact
_MAX_CHUNKS = 3          # facts stored per page


def _store_document(cur, fed, url, html, text):
    """Insert one page into pgz_sport.dokumenti and return cur.rowcount (0 on failure)."""
    title_m = _TITLE_RE.search(html)
    title = title_m.group(1).strip() if title_m else url[:80]
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    try:
        cur.execute(
            """INSERT INTO pgz_sport.dokumenti
            (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
            VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
            (url, url.split('/')[-1][:100], title[:200], text[:50000],
             'web', url, sha1, fed),
        )
    except (psycopg2.Error, ValueError) as exc:
        # Best effort: one bad row must not stop the crawl. ValueError is
        # what psycopg2 raises for strings containing NUL characters.
        log.warning("dokumenti insert failed for %s: %s", url[:80], exc)
        return 0
    return cur.rowcount


def _store_facts(cur, fed, url, text):
    """Insert up to _MAX_CHUNKS leading text chunks into dabi.knowledge; return rows inserted."""
    inserted = 0
    source_refs = json.dumps([{"url": url}])
    category = f'fed_{fed.lower()}'
    for ci in range(_MAX_CHUNKS):
        chunk = text[ci * _CHUNK_SIZE:(ci + 1) * _CHUNK_SIZE]
        if len(chunk) < _MIN_TEXT_LEN:
            continue
        # Hash input is unchanged so previously stored facts still
        # deduplicate via ON CONFLICT (data_hash).
        fh = hashlib.sha256((url + str(ci) + chunk[:80]).encode()).hexdigest()[:32]
        try:
            cur.execute(
                """INSERT INTO dabi.knowledge
                (fact, category, source, source_refs, confidence, data_hash, created_at)
                VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                ON CONFLICT (data_hash) DO NOTHING""",
                (chunk, category, source_refs, fh),
            )
        except (psycopg2.Error, ValueError) as exc:
            log.warning("knowledge insert failed for %s chunk %d: %s", url[:80], ci, exc)
            continue
        inserted += cur.rowcount
    return inserted


def _internal_links(html, page_url):
    """Yield absolute, fragment-free URLs from *html* that share *page_url*'s hostname."""
    host = urlparse(page_url).hostname
    for match in _HREF_RE.finditer(html):
        try:
            # href values are HTML attribute text, so entities such as
            # "&amp;" must be decoded before the URL is resolved.
            link = urljoin(page_url, unescape(match.group(1)).strip())
            link_host = urlparse(link).hostname
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): skip it.
            continue
        if link_host != host:
            continue
        # Drop "#fragment" so anchors on one page are not crawled as
        # separate pages.
        yield link.split('#', 1)[0]


def harvest(*, max_pages=80, delay=2, max_queue=100):
    """Crawl every federation site in ROOTS and store pages and facts in the database.

    For each organisation the crawl is breadth-first from its root URLs and
    stays on the hostname of the page being processed. Each fetched page
    with at least 200 characters of extracted text is inserted into
    ``pgz_sport.dokumenti``; if the text contains a sport keyword, its first
    chunks are also inserted into ``dabi.knowledge``. Failed inserts are
    logged and skipped.

    Args:
        max_pages: Maximum number of URLs attempted per organisation
            (failed fetches count towards the limit).
        delay: Seconds to sleep before every fetch.
        max_queue: Once the pending queue grows beyond this size, the
            remaining links of the current page are not enqueued.

    Raises:
        psycopg2.Error: If the initial database connection fails.
        Exception: Whatever ``fetch`` / ``extract_text`` raise is propagated;
            the cursor and connection are closed first.
    """
    from collections import deque  # local import keeps this block self-contained

    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: every insert is its own transaction, so a failed
        # statement does not poison the following ones.
        conn.autocommit = True
        cur = conn.cursor()
        try:
            total_docs = total_facts = 0
            for fed, roots in ROOTS.items():
                log.info("=== %s deep ===", fed)
                visited = set()
                queue = deque(roots)
                # Every URL ever enqueued; replaces the linear
                # "not in visited and not in queue" scans.
                seen = set(roots)
                while queue and len(visited) < max_pages:
                    url = queue.popleft()
                    if url in visited:
                        continue
                    visited.add(url)
                    time.sleep(delay)
                    html, status = fetch(url)
                    if not html or status != 200:
                        continue
                    log.info(" [%s] %s", status, url[:80])
                    text = extract_text(html)
                    if len(text) < _MIN_TEXT_LEN:
                        continue

                    total_docs += _store_document(cur, fed, url, html, text)

                    # Store facts only when the page looks sport-relevant.
                    lowered = text.lower()
                    if any(kw in lowered for kw in _SPORT_KEYWORDS):
                        total_facts += _store_facts(cur, fed, url, text)

                    # Follow internal links.
                    for link in _internal_links(html, url):
                        if link in seen:
                            continue
                        seen.add(link)
                        queue.append(link)
                        if len(queue) > max_queue:
                            break
            log.info("TOTAL: docs=%s facts=%s", total_docs, total_facts)
        finally:
            cur.close()
    finally:
        conn.close()
# Script entry point: run one full crawl when executed directly.
if __name__ == "__main__":
    harvest()