PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
# Federation deep scrape — HNS, HPS, HRS, HOK, HKS
|
||||
import hashlib
import json
import logging
import os
import re
import sys
import time
from collections import deque
from html import unescape
from urllib.parse import urljoin, urlparse

import psycopg2

# The project-local scraper helpers live outside the default import path.
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from gov_hr_sport_scraper import fetch, extract_text, find_links  # noqa: E402
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
log = logging.getLogger("fed")

# SECURITY: database credentials must not live in source control. The DSN is
# read from the PGZ_SPORT_DSN environment variable; the literal below is kept
# only as a backward-compatible fallback so existing deployments keep working.
# TODO(review): rotate this password and delete the fallback once every
# deployment sets PGZ_SPORT_DSN.
DSN = os.environ.get("PGZ_SPORT_DSN") or (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
)

# Crawl entry points, keyed by the federation code that is stored with every
# scraped row. Each value is the list of root URLs the crawl starts from.
ROOTS = {
    "HNS": ["https://hns-cff.hr", "https://www.nspgz.hr"],
    "HPS": ["https://www.hps.hr"],
    "HRS": ["https://www.hrs.hr"],
    "HOK": ["https://www.hok.hr"],  # Croatian Olympic Committee (Hrvatski olimpijski komitet)
    # Original note read "Hrvatski karatraski savez" (sic), presumably the
    # karate federation — TODO confirm this domain belongs to it.
    "HKS": ["https://www.hks.hr"],
}
|
||||
|
||||
# Substrings (Croatian) that mark a page as sport-relevant enough to store facts.
_SPORT_KEYWORDS = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')
# Compiled once instead of on every crawled page.
_TITLE_RE = re.compile(r'<title[^>]*>([^<]+)</title>', re.I)
_HREF_RE = re.compile(r'href=["\']([^"\']+)["\']', re.I)
# psycopg2 raises ValueError (not psycopg2.Error) for strings containing NUL
# bytes, which scraped text can contain, so both are treated as insert failures.
_INSERT_ERRORS = (psycopg2.Error, ValueError)


def _store_document(cur, fed: str, url: str, title: str, text: str) -> int:
    """Insert one page into pgz_sport.dokumenti and return the rows inserted.

    Best effort: a failed insert is logged and counted as 0 so that one bad
    page does not stop the crawl.
    """
    sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
    try:
        cur.execute(
            """INSERT INTO pgz_sport.dokumenti
               (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
               VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
            (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, fed),
        )
    except _INSERT_ERRORS as exc:
        log.warning("document insert failed for %s: %s", url, exc)
        return 0
    return max(cur.rowcount, 0)


def _store_facts(cur, fed: str, url: str, text: str) -> int:
    """Insert up to three 800-char chunks of a sport-relevant page as facts.

    Only the first 3000 characters are considered and chunks shorter than 200
    characters are skipped. Returns the number of rows actually inserted.
    """
    lowered = text.lower()  # hoisted: lower-case once, not once per keyword
    if not any(keyword in lowered for keyword in _SPORT_KEYWORDS):
        return 0

    category = f'fed_{fed.lower()}'
    source_refs = json.dumps([{"url": url}])
    inserted = 0
    offsets = range(0, min(len(text), 3000), 800)[:3]
    for index, start in enumerate(offsets):
        chunk = text[start:start + 800]
        if len(chunk) < 200:
            continue
        # The hash identifies a chunk by page URL, chunk position and its first 80 chars.
        data_hash = hashlib.sha256((url + str(index) + chunk[:80]).encode()).hexdigest()[:32]
        try:
            cur.execute(
                """INSERT INTO dabi.knowledge
                   (fact, category, source, source_refs, confidence, data_hash, created_at)
                   VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                   ON CONFLICT (data_hash) DO NOTHING""",
                (chunk, category, source_refs, data_hash),
            )
        except _INSERT_ERRORS as exc:
            log.warning("fact insert failed for %s (chunk %d): %s", url, index, exc)
            continue
        inserted += max(cur.rowcount, 0)
    return inserted


def _internal_links(html: str, page_url: str):
    """Yield absolute, fragment-free URLs from *html* on the same host as *page_url*."""
    host = urlparse(page_url).hostname
    for match in _HREF_RE.finditer(html):
        # Attribute values are HTML-escaped (e.g. "&amp;" inside query strings).
        href = unescape(match.group(1)).strip()
        try:
            parts = urlparse(urljoin(page_url, href))
            same_host = parts.hostname == host
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) must not abort the crawl.
            continue
        if same_host:
            # "#fragment" variants point at the same document; crawl it once.
            yield parts._replace(fragment="").geturl()


def _crawl_federation(cur, fed: str, roots, max_pages: int, max_queue: int, delay: float):
    """Breadth-first crawl of one federation; return ``(docs, facts)`` inserted."""
    log.info("=== %s deep ===", fed)
    docs = facts = 0
    pending = deque(dict.fromkeys(roots))  # de-duplicated, order preserved
    seen = set(pending)                    # every URL ever queued
    attempted = 0                          # counts failed fetches too

    while pending and attempted < max_pages:
        url = pending.popleft()
        attempted += 1
        time.sleep(delay)  # be polite to the remote site
        html, status = fetch(url)
        if not html or status != 200:
            continue
        log.info(" [%s] %s", status, url[:80])

        text = extract_text(html)
        if len(text) < 200:
            # Near-empty pages are neither stored nor used for link discovery.
            continue

        title_match = _TITLE_RE.search(html)
        title = unescape(title_match.group(1)).strip() if title_match else url[:80]

        docs += _store_document(cur, fed, url, title, text)
        facts += _store_facts(cur, fed, url, text)

        # Follow internal links until the queue limit is exceeded.
        for link in _internal_links(html, url):
            if link not in seen:
                seen.add(link)
                pending.append(link)
            if len(pending) > max_queue:
                break

    return docs, facts


def harvest(*, max_pages: int = 80, max_queue: int = 100, delay: float = 2.0) -> None:
    """Crawl every federation in ROOTS and store pages and facts in PostgreSQL.

    For each federation the crawl starts at its root URLs and follows
    same-host links breadth-first. Pages go to ``pgz_sport.dokumenti``;
    chunks of sport-relevant pages go to ``dabi.knowledge``. Insert failures
    are logged and skipped; the connection is always closed.

    Args:
        max_pages: maximum number of URLs attempted per federation.
        max_queue: link discovery on a page stops once the pending queue
            holds more than this many URLs.
        delay: seconds to sleep before every fetch.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit keeps one failed INSERT from poisoning later statements.
        conn.autocommit = True
        cur = conn.cursor()
        try:
            total_docs = total_facts = 0
            for fed, roots in ROOTS.items():
                docs, facts = _crawl_federation(cur, fed, roots, max_pages, max_queue, delay)
                total_docs += docs
                total_facts += facts
            log.info("TOTAL: docs=%s facts=%s", total_docs, total_facts)
        finally:
            cur.close()
    finally:
        conn.close()
|
||||
|
||||
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    harvest()
|
||||
Reference in New Issue
Block a user