117 lines
4.4 KiB
Python
Executable File
117 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
D PGŽ news enrichment: scrape novilist.hr, glasistre.hr, sportske.jutarnji.hr
|
|
search pages directly (not via DuckDuckGo, which is blocked). NOTE: only the novilist.hr search is implemented in this script.
|
|
"""
|
|
import json
import os
import re
import sys
import time
import urllib.parse
import urllib.request

import psycopg2
|
|
|
|
# Connection parameters, passed verbatim to psycopg2.connect(**DB).
# SECURITY: a database credential is hard-coded in this file.  Supply it
# through the RINET_DB_PASSWORD environment variable instead; the literal is
# kept only as a backward-compatible fallback and should be rotated and
# removed once every deployment sets the variable.
DB = dict(
    host="localhost",
    port=5432,
    dbname="rinet_v3",
    user="rinet",
    password=os.environ.get("RINET_DB_PASSWORD", "R1net2026!SecureDB#v7"),
)

# User-Agent header sent with every outgoing HTTP request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
|
|
|
def out(msg):
    """Print *msg* to stdout and flush right away so output appears immediately."""
    print(msg, flush=True)
|
|
|
|
def http_get(url, timeout=12):
    """Fetch *url* and return the response body as text, or ``None`` on failure.

    The request carries the module-level ``UA`` as User-Agent together with an
    ``Accept-Language: hr,en`` header.  The body is decoded as UTF-8, with
    undecodable bytes replaced.

    Args:
        url: Address to request.
        timeout: Timeout in seconds handed to ``urlopen``.

    Returns:
        The decoded body, or ``None`` if anything raised while building the
        request, connecting, reading or decoding.
    """
    try:
        request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw_body = response.read()
        return raw_body.decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: every failure is reported as None.
        return None
|
|
|
|
def html_to_text(h):
    """Reduce an HTML document to plain text with normalized whitespace.

    Steps, in order:
      1. ``<script>`` and ``<style>`` elements are removed together with their
         contents (case-insensitively, across line breaks).
      2. Every remaining tag is replaced by a single space.
      3. Character references are decoded (``&amp;`` -> ``&``, ``&nbsp;`` ->
         no-break space, ``&#269;`` -> ``č`` ...) instead of being deleted, so
         entity-encoded letters survive in the extracted text.
      4. Runs of whitespace, including no-break spaces, collapse to one space
         and the result is stripped.

    Args:
        h: HTML markup as a string.

    Returns:
        The extracted plain text (possibly empty).
    """
    # Function-scope import so this block does not rely on file-level imports.
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', h, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode only after tags are gone, so an escaped "&lt;b&gt;" stays literal
    # text rather than being mistaken for markup and stripped.
    h = unescape(h)
    return re.sub(r'\s+', ' ', h).strip()
|
|
|
|
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    *text* is split after ``.``, ``!`` or ``?`` followed by whitespace.  A
    sentence is kept when it is longer than 60 characters and contains the
    surname (case-insensitive substring match).  Collection stops once the
    space-joined selection exceeds 700 characters; the result is capped at
    1300 characters.

    Args:
        text: Plain text to scan.
        ime: First name.  Unused: any sentence containing "<ime> <prez>"
            necessarily contains <prez>, so the surname test alone decides.
            Kept for interface compatibility.
        prez: Surname to look for.
        sport: Unused; kept for interface compatibility.

    Returns:
        The selected sentences joined by single spaces, or ``""`` when nothing
        matches or when *text* / *prez* is empty or ``None``.
    """
    surname = (prez or "").strip().lower()
    # An empty needle is a substring of every sentence, so without this guard
    # a missing surname would return arbitrary text as the "biography".
    if not text or not surname:
        return ""

    picked = []
    joined_len = 0  # equals len(" ".join(picked)) without re-joining each time
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 60 and surname in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 700:
                break
    return " ".join(picked)[:1300]
|
|
|
|
def search_novilist(query):
    """Query the novilist.hr site search and return candidate links.

    The URL-quoted *query* is passed as the ``s`` parameter of the site's
    front page.  Every ``href`` on the returned page that points at
    ``https://www.novilist.hr/...`` is collected in page order.

    Returns:
        At most the first five such links, or an empty list when the search
        page could not be fetched.
    """
    encoded = urllib.parse.quote(query)
    page = http_get(f"https://www.novilist.hr/?s={encoded}")
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    return links[:5]
|
|
|
|
def _pick_biography(urls, ime, prez, sport_kw):
    """Return ``(text, url)`` for the first candidate article with a usable bio.

    Only the first three *urls* are considered.  A URL is skipped when it
    contains one of the listing/asset fragments below (plain substring test).
    An article qualifies when ``relevant_paragraph`` yields at least 200
    characters.  Returns ``("", None)`` when nothing qualifies.
    """
    for u in urls[:3]:
        if any(skip in u for skip in ("autor", "kategorija", "tag", "wp-content", "feed", "page=")):
            continue
        page = http_get(u, timeout=10)
        if not page:
            continue
        para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
        if para and len(para) >= 200:
            return para, u
        time.sleep(0.3)  # short pause before requesting the next article
    return "", None


def main():
    """Enrich member biographies in ``pgz_sport.clanovi`` from novilist.hr.

    Selects up to 50 members whose ``kategorija_hoo`` is 1, 2 or 3, or who
    have a ``clan_nagrada`` row with ``razina_natjecanja`` in
    ('SP', 'EP', 'OI', 'SK'), and whose ``biografija`` is NULL or shorter than
    200 characters.  For each one the novilist.hr search is queried and the
    first usable article excerpt is written to ``biografija`` (with
    ``source_url`` / ``source_synced_at``) and recorded as a fact in
    ``dabi.knowledge``.

    The connection runs in autocommit mode, so each statement is committed on
    its own.  Per-member database errors are logged and do not stop the run.
    The connection is closed even if the run aborts with an exception.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
            FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE (c.kategorija_hoo IN (1, 2, 3)
                   OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                               WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
              AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
            ORDER BY c.kategorija_hoo NULLS LAST LIMIT 50""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0  # stays 0 when there are no targets

        for tried, (tid, ime, prez, sport, _kat, klub) in enumerate(targets, 1):
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            bio_text, bio_url = "", None
            if ime and prez:
                # Try Novi list (Rijeka regional)
                urls = search_novilist(f"{full} {sport_kw}")
                time.sleep(0.4)
                bio_text, bio_url = _pick_biography(urls, ime, prez, sport_kw)
            else:
                # A NULL/empty name would produce a junk search query and
                # cannot be matched in article text.
                out(f" SKIP id={tid}: missing name")

            if bio_text:
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                        SET biografija = %s,
                            source_url = COALESCE(source_url, %s),
                            source_synced_at = now()
                        WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                        VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                        (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                         "novilist", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except Exception as e:
                    # Best-effort per member: log and move on to the next one.
                    out(f" ERR {full}: {e}")

            # Report progress on every 10th target, whether or not this
            # particular target produced a biography.
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4 if bio_text else 0.3)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()
|
|
|
|
# Run the enrichment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|