PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)

This commit is contained in:
Damir Radulić
2026-05-04 23:39:08 +02:00
commit a7ec0a86be
1820 changed files with 694455 additions and 0 deletions
+142
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.
Faster + more reliable than search-based approaches.
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=10):
    """Best-effort HTTP GET.

    Sends the request with the module-level UA string as User-Agent and returns the
    body decoded as UTF-8 (undecodable bytes replaced). Any failure yields None
    instead of raising.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* on the *lang* wiki.

    Returns the decoded JSON object (a dict), or None when the title is empty, the
    request fails, or the response is not a JSON object.
    """
    if not title:
        return None
    # safe="" also percent-encodes "/", which would otherwise be read as a path
    # separator by the REST endpoint (e.g. a title such as "AC/DC").
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}")
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use .get() on the result, so hand back only JSON objects.
    return data if isinstance(data, dict) else None
# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with known wiki entries.
# A title of None means no page is known for that language; main() skips that lookup.
# Every athlete appears exactly once so no row is fetched and written twice per run.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Vaterpolo PGŽ
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Boćanje legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Atletika: Sara Kolak is already listed under Olympic medalists above.
]
def main():
    """Enrich the curated athletes in pgz_sport.clanovi with Wikipedia summaries.

    For each CURATED entry: find the member row by first and last name, fetch the HR
    summary (falling back to EN), write the extract to ``biografija`` and add a fact
    row to ``dabi.knowledge``. Prints per-athlete progress and a closing report.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        enriched = 0
        tried = 0
        for full, hr_title, en_title in CURATED:
            tried += 1
            ime, prez = full.split(" ", 1) if " " in full else (full, "")
            # Find clan record
            cr.execute("""SELECT id, sport, klub_id FROM pgz_sport.clanovi
                          WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                          LIMIT 1""", (ime, prez))
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row
            # Fetch wiki - try hr first then en. Only an explicitly accepted summary is
            # kept, so a rejected page (e.g. a disambiguation) can never be written.
            summary = None
            wlang = None
            for lang, title in (("hr", hr_title), ("en", en_title)):
                if not title:
                    continue
                candidate = wiki_summary(title, lang)
                if candidate and candidate.get("type") in ("standard", None):
                    summary = candidate
                    wlang = lang
                    break
                time.sleep(0.2)
            if not summary or not summary.get("extract"):
                out(f"{full} - no wiki page")
                continue
            extract = summary["extract"].strip()[:1500]
            # None (not "") when the URL is missing, so COALESCE does not store an
            # empty string as source_url.
            wurl = (summary.get("content_urls") or {}).get("desktop", {}).get("page") or None
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s, source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, wurl, cid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                            f"wikipedia_{wlang}", 0.95, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except Exception as e:
                out(f" ERR {full}: {e}")
            time.sleep(0.3)
        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")
        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
                      WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15""")
        out("\nTop bios:")
        for r in cr.fetchall():
            # NULL columns come back as None, which rejects a width format spec.
            out(f" {r[0] or '':18} {r[1] or '':18} {r[2] or '':15} {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
if __name__ == "__main__":
main()
+142
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env python3
"""
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.
Faster + more reliable than search-based approaches.
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=10):
    """Best-effort HTTP GET.

    Sends the request with the module-level UA string as User-Agent and returns the
    body decoded as UTF-8 (undecodable bytes replaced). Any failure yields None
    instead of raising.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* on the *lang* wiki.

    Returns the decoded JSON object (a dict), or None when the title is empty, the
    request fails, or the response is not a JSON object.
    """
    if not title:
        return None
    # safe="" also percent-encodes "/", which would otherwise be read as a path
    # separator by the REST endpoint (e.g. a title such as "AC/DC").
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}")
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use .get() on the result, so hand back only JSON objects.
    return data if isinstance(data, dict) else None
# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with known wiki entries.
# A title of None means no page is known for that language; main() skips that lookup.
# Every athlete appears exactly once so no row is fetched and written twice per run.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Vaterpolo PGŽ
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Boćanje legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Atletika: Sara Kolak is already listed under Olympic medalists above.
]
def main():
    """Enrich the curated athletes in pgz_sport.clanovi with Wikipedia summaries.

    For each CURATED entry: find the member row by first and last name, fetch the HR
    summary (falling back to EN), write the extract to ``biografija`` and add a fact
    row to ``dabi.knowledge``. Prints per-athlete progress and a closing report.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        enriched = 0
        tried = 0
        for full, hr_title, en_title in CURATED:
            tried += 1
            ime, prez = full.split(" ", 1) if " " in full else (full, "")
            # Find clan record
            cr.execute("""SELECT id, sport, klub_id FROM pgz_sport.clanovi
                          WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                          LIMIT 1""", (ime, prez))
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row
            # Fetch wiki - try hr first then en. Only an explicitly accepted summary is
            # kept, so a rejected page (e.g. a disambiguation) can never be written.
            summary = None
            wlang = None
            for lang, title in (("hr", hr_title), ("en", en_title)):
                if not title:
                    continue
                candidate = wiki_summary(title, lang)
                if candidate and candidate.get("type") in ("standard", None):
                    summary = candidate
                    wlang = lang
                    break
                time.sleep(0.2)
            if not summary or not summary.get("extract"):
                out(f" ✗ {full} - no wiki page")
                continue
            extract = summary["extract"].strip()[:1500]
            # None (not "") when the URL is missing, so COALESCE does not store an
            # empty string as source_url.
            wurl = (summary.get("content_urls") or {}).get("desktop", {}).get("page") or None
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s, source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, wurl, cid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                            f"wikipedia_{wlang}", 0.95, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except Exception as e:
                out(f" ERR {full}: {e}")
            time.sleep(0.3)
        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")
        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
                      WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15""")
        out("\nTop bios:")
        for r in cr.fetchall():
            # NULL columns come back as None, which rejects a width format spec.
            out(f" {r[0] or '':18} {r[1] or '':18} {r[2] or '':15} {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
if __name__ == "__main__":
main()
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
D PGŽ news enrichment: scrape the novilist.hr search page directly (not via DDG, which is blocked).
Only novilist.hr is queried by the code below; glasistre.hr and sportske.jutarnji.hr are not implemented.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=12):
    """Best-effort HTTP GET preferring Croatian content.

    Sends the module-level UA string as User-Agent plus ``Accept-Language: hr,en`` and
    returns the body decoded as UTF-8 (undecodable bytes replaced). Any failure
    yields None instead of raising.
    """
    try:
        request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def html_to_text(h):
    """Reduce an HTML document to plain text on a single line.

    Drops <script>/<style> elements (any letter case), replaces the remaining tags
    with spaces, decodes named and numeric character references, and collapses
    whitespace runs. Returns the stripped result.
    """
    # Local import: the file's import block does not bring in the html module.
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', h, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode only after the tags are gone so "&lt;b&gt;" cannot turn into markup.
    # Decoding (rather than deleting) keeps letters such as "&scaron;" / "&#263;".
    h = unescape(h)
    # \s also matches the no-break space produced by "&nbsp;".
    return re.sub(r'\s+', ' ', h).strip()
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    A sentence qualifies when it is longer than 60 characters and contains a word
    that starts with the surname (case-insensitive). Collection stops once more than
    700 characters are gathered; the joined result is capped at 1300 characters.
    *ime* and *sport* are accepted for interface compatibility but not used for
    matching. Returns "" when nothing matches or the surname is empty.
    """
    surname = (prez or "").strip().lower()
    if not text or not surname:
        # An empty surname would otherwise match every sentence.
        return ""
    # Anchor at a word start only: the end stays open so inflected forms
    # ("Kolakovoj", "Šarića") still match, while mid-word hits do not.
    surname_re = re.compile(r'\b' + re.escape(surname))
    picked = []
    total = 0  # length of " ".join(picked), tracked incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 60 and surname_re.search(sentence.lower()):
            total += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if total > 700:
                break
    return " ".join(picked)[:1300]
def search_novilist(query):
    """Run a novilist.hr site search and return up to 5 distinct result links.

    Returns the first five unique absolute novilist.hr hrefs found on the search
    page, in page order, or [] when the page could not be fetched.
    """
    enc = urllib.parse.quote(query)
    page = http_get(f"https://www.novilist.hr/?s={enc}")
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    # dict.fromkeys removes repeats while preserving order, so a link that appears
    # several times on the page does not use up the five-link budget.
    return list(dict.fromkeys(links))[:5]
def main():
    """Fill in missing athlete biographies from novilist.hr articles.

    Selects up to 50 members (HOO category 1-3, or with an SP/EP/OI/SK award row)
    whose ``biografija`` is NULL or shorter than 200 characters. For each one it
    searches novilist.hr, scrapes up to three candidate articles, and stores the first
    extracted passage of at least 200 characters in ``biografija`` plus a fact row in
    ``dabi.knowledge``.
    """
    # URL fragments that mark listing/asset pages rather than articles.
    skip_markers = ("autor", "kategorija", "tag", "wp-content", "feed", "page=")
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 50""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")
        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"
            # Try Novi list (Riječki regional)
            urls = search_novilist(f"{full} {sport_kw}")
            time.sleep(0.4)
            # Filter first, then cap at three, so skipped links do not use up the budget.
            candidates = [u for u in urls if not any(marker in u for marker in skip_markers)][:3]
            bio_text = ""
            bio_url = None
            for u in candidates:
                page = http_get(u, timeout=10)
                if not page:
                    continue
                para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
                if para and len(para) >= 200:
                    bio_text = para
                    bio_url = u
                    break
                time.sleep(0.3)
            if not bio_text:
                time.sleep(0.3)
                continue
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""", (bio_text, bio_url, tid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                            "novilist", 0.85, "biografija_sportasa"))
                enriched += 1
                out(f"{full} - {len(bio_text)} chars from {bio_url[:80]}")
            except Exception as e:
                out(f" ERR {full}: {e}")
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or scrape step raises.
        conn.close()
if __name__ == "__main__":
main()
+116
View File
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
D PGŽ news enrichment: scrape the novilist.hr search page directly (not via DDG, which is blocked).
Only novilist.hr is queried by the code below; glasistre.hr and sportske.jutarnji.hr are not implemented.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=12):
    """Best-effort HTTP GET preferring Croatian content.

    Sends the module-level UA string as User-Agent plus ``Accept-Language: hr,en`` and
    returns the body decoded as UTF-8 (undecodable bytes replaced). Any failure
    yields None instead of raising.
    """
    try:
        request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def html_to_text(h):
    """Reduce an HTML document to plain text on a single line.

    Drops <script>/<style> elements (any letter case), replaces the remaining tags
    with spaces, decodes named and numeric character references, and collapses
    whitespace runs. Returns the stripped result.
    """
    # Local import: the file's import block does not bring in the html module.
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', h, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode only after the tags are gone so "&lt;b&gt;" cannot turn into markup.
    # Decoding (rather than deleting) keeps letters such as "&scaron;" / "&#263;".
    h = unescape(h)
    # \s also matches the no-break space produced by "&nbsp;".
    return re.sub(r'\s+', ' ', h).strip()
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    A sentence qualifies when it is longer than 60 characters and contains a word
    that starts with the surname (case-insensitive). Collection stops once more than
    700 characters are gathered; the joined result is capped at 1300 characters.
    *ime* and *sport* are accepted for interface compatibility but not used for
    matching. Returns "" when nothing matches or the surname is empty.
    """
    surname = (prez or "").strip().lower()
    if not text or not surname:
        # An empty surname would otherwise match every sentence.
        return ""
    # Anchor at a word start only: the end stays open so inflected forms
    # ("Kolakovoj", "Šarića") still match, while mid-word hits do not.
    surname_re = re.compile(r'\b' + re.escape(surname))
    picked = []
    total = 0  # length of " ".join(picked), tracked incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 60 and surname_re.search(sentence.lower()):
            total += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if total > 700:
                break
    return " ".join(picked)[:1300]
def search_novilist(query):
    """Run a novilist.hr site search and return up to 5 distinct result links.

    Returns the first five unique absolute novilist.hr hrefs found on the search
    page, in page order, or [] when the page could not be fetched.
    """
    enc = urllib.parse.quote(query)
    page = http_get(f"https://www.novilist.hr/?s={enc}")
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    # dict.fromkeys removes repeats while preserving order, so a link that appears
    # several times on the page does not use up the five-link budget.
    return list(dict.fromkeys(links))[:5]
def main():
    """Fill in missing athlete biographies from novilist.hr articles.

    Selects up to 50 members (HOO category 1-3, or with an SP/EP/OI/SK award row)
    whose ``biografija`` is NULL or shorter than 200 characters. For each one it
    searches novilist.hr, scrapes up to three candidate articles, and stores the first
    extracted passage of at least 200 characters in ``biografija`` plus a fact row in
    ``dabi.knowledge``.
    """
    # URL fragments that mark listing/asset pages rather than articles.
    skip_markers = ("autor", "kategorija", "tag", "wp-content", "feed", "page=")
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 50""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")
        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"
            # Try Novi list (Riječki regional)
            urls = search_novilist(f"{full} {sport_kw}")
            time.sleep(0.4)
            # Filter first, then cap at three, so skipped links do not use up the budget.
            candidates = [u for u in urls if not any(marker in u for marker in skip_markers)][:3]
            bio_text = ""
            bio_url = None
            for u in candidates:
                page = http_get(u, timeout=10)
                if not page:
                    continue
                para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
                if para and len(para) >= 200:
                    bio_text = para
                    bio_url = u
                    break
                time.sleep(0.3)
            if not bio_text:
                time.sleep(0.3)
                continue
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""", (bio_text, bio_url, tid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                            "novilist", 0.85, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
            except Exception as e:
                out(f" ERR {full}: {e}")
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or scrape step raises.
        conn.close()
if __name__ == "__main__":
main()
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
Gather first 2-3 hits, scrape, extract relevant paragraphs that mention the athlete + sport.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=12):
    """Best-effort HTTP GET that presents itself like a browser.

    Sends the module-level UA string plus HTML ``Accept`` and ``Accept-Language: hr,en``
    headers and returns the body decoded as UTF-8 (undecodable bytes replaced). Any
    failure yields None instead of raising.
    """
    try:
        request_headers = {
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "hr,en",
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def ddg_search(query, limit=3):
    """Search DuckDuckGo's HTML endpoint and return result links.

    Returns a list of ``(url, title)`` tuples in page order, stopping once *limit*
    results are collected, or [] when the page could not be fetched. DuckDuckGo
    redirect links are resolved to their target URL.
    """
    # Local import: the file's import block does not bring in the html module.
    from html import unescape

    q = urllib.parse.quote(query)
    page = http_get(f"https://html.duckduckgo.com/html/?q={q}")
    if not page:
        return []
    results = []
    # Extract <a class="result__a" href="...">title</a>
    for m in re.finditer(r'<a [^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)</a>', page):
        # The attribute value is HTML-escaped ("&amp;" between query parameters).
        href = unescape(m.group(1))
        title = m.group(2).strip()
        # Strip DDG redirect: parse the query string *before* decoding, so a target
        # URL containing "&" is not cut short, and decode it exactly once.
        target = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("uddg")
        link = target[0] if target else href
        if link.startswith("//"):
            # Protocol-relative link; urlopen needs an explicit scheme.
            link = "https:" + link
        results.append((link, title))
        if len(results) >= limit:
            break
    return results
def html_to_text(html):
    """Reduce an HTML document to plain text on a single line.

    Drops <script>/<style> elements (any letter case), replaces the remaining tags
    with spaces, decodes named and numeric character references, and collapses
    whitespace runs into single spaces. Leading/trailing spaces are not stripped.
    """
    # Import the function only: the parameter is itself named "html".
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode only after the tags are gone so "&lt;b&gt;" cannot turn into markup.
    # Decoding (rather than deleting) numeric references keeps letters such as
    # "&#353;" intact, which surname matching depends on.
    h = unescape(h)
    # \s also matches the no-break space produced by "&nbsp;".
    return re.sub(r'\s+', ' ', h)
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    A sentence qualifies when it is longer than 50 characters and contains a word
    that starts with the surname (case-insensitive). Collection stops once more than
    800 characters are gathered; the joined result is capped at 1500 characters.
    *ime* and *sport* are accepted for interface compatibility but not used for
    matching. Returns "" when nothing matches or the surname is empty.
    """
    surname = (prez or "").strip().lower()
    if not text or not surname:
        # An empty surname would otherwise match every sentence.
        return ""
    # Anchor at a word start only: the end stays open so inflected forms
    # ("Kolakovoj", "Šarića") still match, while mid-word hits do not.
    surname_re = re.compile(r'\b' + re.escape(surname))
    picked = []
    total = 0  # length of " ".join(picked), tracked incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 50 and surname_re.search(sentence.lower()):
            total += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if total > 800:
                break
    return " ".join(picked)[:1500]
def _is_social_link(link):
    """Return True when *link* points at a social-media host that should be skipped."""
    social_domains = ("facebook.com", "instagram.com", "twitter.com", "youtube.com",
                      "x.com", "tiktok.com")
    try:
        host = (urllib.parse.urlsplit(link).hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): not recognisably social.
        return False
    # Compare whole domains so that e.g. "netflix.com" is not mistaken for "x.com".
    return any(host == d or host.endswith("." + d) for d in social_domains)


def main():
    """Fill in missing athlete biographies from pages found through DuckDuckGo.

    Selects up to 80 members (HOO category 1-3, or with an SP/EP/OI/SK award row)
    whose ``biografija`` is NULL or shorter than 200 characters. For each one it
    searches DuckDuckGo, scrapes the non-social results, and stores the first
    extracted passage of at least 200 characters in ``biografija`` plus a fact row in
    ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")
        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"
            # DuckDuckGo query
            query = f'"{full}" {sport_kw} Rijeka'
            results = ddg_search(query, limit=3)
            if not results:
                time.sleep(0.5)
                continue
            bio_text = ""
            bio_url = None
            for link, _title in results[:3]:
                # Skip social-media hosts; they rarely carry biographical prose.
                if _is_social_link(link):
                    continue
                page = http_get(link, timeout=10)
                if not page:
                    continue
                para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
                if para and len(para) >= 200:
                    bio_text = para
                    bio_url = link
                    break
                time.sleep(0.3)
            if not bio_text:
                time.sleep(0.4)
                continue
            # Insert
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""", (bio_text, bio_url, tid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
                            "online_news", 0.85, "biografija_sportasa"))
                enriched += 1
                out(f"{full} - {len(bio_text)} chars from {bio_url[:80]}")
            except Exception as e:
                out(f" ERR {full}: {e}")
            if tried % 15 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.5)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or scrape step raises.
        conn.close()
if __name__ == "__main__":
main()
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
"""
D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
Gather first 2-3 hits, scrape, extract relevant paragraphs that mention the athlete + sport.
"""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
def out(msg):
    """Print *msg* on stdout and flush right away so progress is visible live."""
    print(msg, flush=True)
def http_get(url, timeout=12):
    """Best-effort HTTP GET that presents itself like a browser.

    Sends the module-level UA string plus HTML ``Accept`` and ``Accept-Language: hr,en``
    headers and returns the body decoded as UTF-8 (undecodable bytes replaced). Any
    failure yields None instead of raising.
    """
    try:
        request_headers = {
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "hr,en",
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def ddg_search(query, limit=3):
    """Search DuckDuckGo's HTML endpoint and return result links.

    Returns a list of ``(url, title)`` tuples in page order, stopping once *limit*
    results are collected, or [] when the page could not be fetched. DuckDuckGo
    redirect links are resolved to their target URL.
    """
    # Local import: the file's import block does not bring in the html module.
    from html import unescape

    q = urllib.parse.quote(query)
    page = http_get(f"https://html.duckduckgo.com/html/?q={q}")
    if not page:
        return []
    results = []
    # Extract <a class="result__a" href="...">title</a>
    for m in re.finditer(r'<a [^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)</a>', page):
        # The attribute value is HTML-escaped ("&amp;" between query parameters).
        href = unescape(m.group(1))
        title = m.group(2).strip()
        # Strip DDG redirect: parse the query string *before* decoding, so a target
        # URL containing "&" is not cut short, and decode it exactly once.
        target = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("uddg")
        link = target[0] if target else href
        if link.startswith("//"):
            # Protocol-relative link; urlopen needs an explicit scheme.
            link = "https:" + link
        results.append((link, title))
        if len(results) >= limit:
            break
    return results
def html_to_text(html):
    """Reduce an HTML document to plain text on a single line.

    Drops <script>/<style> elements (any letter case), replaces the remaining tags
    with spaces, decodes named and numeric character references, and collapses
    whitespace runs into single spaces. Leading/trailing spaces are not stripped.
    """
    # Import the function only: the parameter is itself named "html".
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode only after the tags are gone so "&lt;b&gt;" cannot turn into markup.
    # Decoding (rather than deleting) numeric references keeps letters such as
    # "&#353;" intact, which surname matching depends on.
    h = unescape(h)
    # \s also matches the no-break space produced by "&nbsp;".
    return re.sub(r'\s+', ' ', h)
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of *text* that mention the surname *prez*.

    A sentence qualifies when it is longer than 50 characters and contains a word
    that starts with the surname (case-insensitive). Collection stops once more than
    800 characters are gathered; the joined result is capped at 1500 characters.
    *ime* and *sport* are accepted for interface compatibility but not used for
    matching. Returns "" when nothing matches or the surname is empty.
    """
    surname = (prez or "").strip().lower()
    if not text or not surname:
        # An empty surname would otherwise match every sentence.
        return ""
    # Anchor at a word start only: the end stays open so inflected forms
    # ("Kolakovoj", "Šarića") still match, while mid-word hits do not.
    surname_re = re.compile(r'\b' + re.escape(surname))
    picked = []
    total = 0  # length of " ".join(picked), tracked incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 50 and surname_re.search(sentence.lower()):
            total += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if total > 800:
                break
    return " ".join(picked)[:1500]
def _is_social_link(link):
    """Return True when *link* points at a social-media host that should be skipped."""
    social_domains = ("facebook.com", "instagram.com", "twitter.com", "youtube.com",
                      "x.com", "tiktok.com")
    try:
        host = (urllib.parse.urlsplit(link).hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): not recognisably social.
        return False
    # Compare whole domains so that e.g. "netflix.com" is not mistaken for "x.com".
    return any(host == d or host.endswith("." + d) for d in social_domains)


def main():
    """Fill in missing athlete biographies from pages found through DuckDuckGo.

    Selects up to 80 members (HOO category 1-3, or with an SP/EP/OI/SK award row)
    whose ``biografija`` is NULL or shorter than 200 characters. For each one it
    searches DuckDuckGo, scrapes the non-social results, and stores the first
    extracted passage of at least 200 characters in ``biografija`` plus a fact row in
    ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")
        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"
            # DuckDuckGo query
            query = f'"{full}" {sport_kw} Rijeka'
            results = ddg_search(query, limit=3)
            if not results:
                time.sleep(0.5)
                continue
            bio_text = ""
            bio_url = None
            for link, _title in results[:3]:
                # Skip social-media hosts; they rarely carry biographical prose.
                if _is_social_link(link):
                    continue
                page = http_get(link, timeout=10)
                if not page:
                    continue
                para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
                if para and len(para) >= 200:
                    bio_text = para
                    bio_url = link
                    break
                time.sleep(0.3)
            if not bio_text:
                time.sleep(0.4)
                continue
            # Insert
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""", (bio_text, bio_url, tid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
                            "online_news", 0.85, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
            except Exception as e:
                out(f" ERR {full}: {e}")
            if tried % 15 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.5)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or scrape step raises.
        conn.close()
if __name__ == "__main__":
main()
+172
View File
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
D: Wikipedia/online enrichment for top athletes (sportaši).
For each athlete: fetch hr.wikipedia + en.wikipedia summary,
extract bio + medalje + datum/mjesto rođenja, populate clanovi.biografija + dabi.knowledge facts.
"""
import re, json, time
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
# Pause, in seconds, that main() passes to time.sleep() between Wikipedia lookups.
DELAY = 0.5
def http_get(url, timeout=15):
    """Best-effort HTTP GET preferring Croatian content.

    Sends the module-level UA string as User-Agent plus ``Accept-Language: hr,en`` and
    returns the body decoded as UTF-8 (undecodable bytes replaced). Every failure,
    including HTTP error statuses such as 404, yields None instead of raising.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA, "Accept-Language": "hr,en"})
        with urllib.request.urlopen(req, timeout=timeout) as r:
            return r.read().decode("utf-8", errors="replace")
    except Exception:
        # HTTPError is an Exception subclass, so one handler covers the former
        # separate branch, whose 404 and non-404 paths both returned None anyway.
        return None
def wiki_summary(title, lang="hr"):
    """Use the Wikipedia REST API to fetch a clean summary for *title*.

    Returns the decoded JSON object (a dict), or None when the title is empty, the
    request fails, or the response is not a JSON object.
    """
    if not title:
        return None
    # safe="" also percent-encodes "/", which would otherwise be read as a path
    # separator by the REST endpoint (e.g. a title such as "AC/DC").
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}")
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    # Callers use .get() on the result, so hand back only JSON objects.
    return data if isinstance(data, dict) else None
def wiki_search(query, lang="hr"):
    """Look up candidate Wikipedia pages for *query* through the opensearch API.

    Requests at most three suggestions from the *lang* wiki and returns them as a
    list of ``(title, url)`` pairs. Returns [] when the request fails or the
    response cannot be interpreted.
    """
    encoded = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={encoded}&limit=3&format=json"
    body = http_get(endpoint)
    if not body:
        return []
    try:
        payload = json.loads(body)
        # opensearch answers with [query, titles, descriptions, urls]
        titles, links = payload[1], payload[3]
        return list(zip(titles, links))
    except Exception:
        return []
def main():
    """Enrich top athletes in pgz_sport.clanovi with Wikipedia summaries.

    Targets members with HOO category 1-3 or an SP/EP/OI/SK award row and skips those
    whose ``biografija`` already exceeds 200 characters. For each target it tries the
    HR wiki, then the EN wiki: first the page titled with the athlete's full name,
    then the opensearch suggestions. An accepted extract (at least 80 characters) is
    written to ``biografija`` and added as a fact row in ``dabi.knowledge``.
    """
    # Substrings that mark a page as sports-related when it is fetched by exact name.
    direct_keywords = ("sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil",
                       "kuglač", "boćar", "skij", "karat", "kickbox", "wushu",
                       "ju-jitsu", "sahist", "šahis", "biciklist", "plivač",
                       "plivat", "boxer", "olimpij", "bonifac", "athlete", "compete",
                       "sportaš", "swimmer", "diver", "sailor", "wrestler")
    # Substrings used for pages that come from the search fallback.
    search_keywords = ("sport", "igrač", "klub", "natjecat", "atlet", "vater", "ronil",
                       "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "atletič",
                       "ju-jitsu", "šahist", "biciklist", "plivač", "olimpij", "athlete",
                       "compete", "sportaš", "swimmer", "diver", "sailor", "wrestler")
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        # Get target list: HOO kat 1-3 + SP/EP/OI medalisti
        cr.execute("""
            SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, c.biografija, k.naziv
            FROM pgz_sport.clanovi c
            LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE c.kategorija_hoo IN (1, 2, 3)
               OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                           WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK'))
            ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
        """)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")
        enriched = 0
        fact_count = 0
        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"
            summary = None
            wiki_lang = None
            wiki_url = None
            # Try HR wiki first
            for lang in ("hr", "en"):
                # Direct title try
                s = wiki_summary(full, lang)
                if s and s.get("type") == "standard" and not s.get("disambiguation"):
                    # Sanity check: must mention sport in description or extract.
                    # "or ''" guards against a JSON null in either field.
                    desc = ((s.get("description") or "") + " " + (s.get("extract") or ""))[:2000].lower()
                    if any(kw in desc for kw in direct_keywords):
                        summary = s
                        wiki_lang = lang
                        wiki_url = (s.get("content_urls") or {}).get("desktop", {}).get("page")
                        break
                time.sleep(DELAY)
                # Search fallback
                for title, _url in wiki_search(full, lang):
                    # Skip disambiguations
                    lowered = title.lower()
                    if "razdvojba" in lowered or "disambiguation" in lowered:
                        continue
                    s2 = wiki_summary(title, lang)
                    if not s2:
                        continue
                    desc = ((s2.get("description") or "") + " " + (s2.get("extract") or ""))[:2000].lower()
                    # Search hits are accepted only when the text contains a generic
                    # sports keyword AND the first five letters of the athlete's own sport.
                    if any(kw in desc for kw in search_keywords) and sport and sport.lower()[:5] in desc:
                        summary = s2
                        wiki_lang = lang
                        wiki_url = (s2.get("content_urls") or {}).get("desktop", {}).get("page")
                        break
                    time.sleep(DELAY)
                if summary:
                    break
            if not summary or not summary.get("extract"):
                continue
            extract = summary.get("extract", "").strip()[:1500]
            if len(extract) < 80:
                continue
            # Update clanovi
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source = COALESCE(source, %s),
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, f"wikipedia_{wiki_lang}", wiki_url, tid))
                enriched += 1
            except Exception as e:
                print(f" ERR update {full}: {e}")
                continue
            # Insert as fact in dabi.knowledge
            fact = f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {wiki_lang.upper()})"
            try:
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (fact[:2000], f"wikipedia_{wiki_lang}", 0.9, "biografija_sportasa"))
                if cr.rowcount:
                    fact_count += 1
            except Exception as e:
                # Still best-effort, but report the failure instead of hiding it.
                print(f" ERR fact {full}: {e}")
            print(f"{full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)
        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")
        # Sample bios
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
                      FROM pgz_sport.clanovi
                      WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
                        AND source LIKE 'wikipedia%'
                      ORDER BY bio_len DESC LIMIT 15""")
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f" {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
if __name__ == "__main__":
main()
+172
View File
@@ -0,0 +1,172 @@
#!/usr/bin/env python3
"""
D: Wikipedia/online enrichment for top athletes (sportaši).
For each athlete: fetch hr.wikipedia + en.wikipedia summary,
extract bio + medalje + datum/mjesto rođenja, populate clanovi.biografija + dabi.knowledge facts.
"""
import re, json, time
import urllib.request, urllib.parse
import psycopg2
# NOTE(review): the connection settings below, including the plaintext password, are
# committed to source control. Load them from the environment or a secrets store and
# rotate this password.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent string that http_get() attaches to every outgoing request.
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
# Pause in seconds between lookups. NOTE(review): presumably passed to time.sleep()
# by main(), which is cut off in this view — confirm.
DELAY = 0.5
def http_get(url, timeout=15):
    """Fetch *url* and return the response body as text, or None on any failure.

    The body is decoded as UTF-8 with undecodable bytes replaced. HTTP errors
    (404 included), network errors and malformed URLs all yield None.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        # HTTPError is an Exception subclass, so one handler covers the 404
        # case and every other failure alike.
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* and return it as a dict.

    Spaces in the title become underscores before URL-quoting. Returns None
    when the request fails or the body is not valid JSON.
    """
    slug = urllib.parse.quote(title.replace(" ", "_"))
    payload = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not payload:
        return None
    try:
        parsed = json.loads(payload)
    except Exception:
        return None
    return parsed
def wiki_search(query, lang="hr"):
    """Run an opensearch query and return up to three (title, url) pairs.

    Returns an empty list when the request fails or the response cannot be
    unpacked into the expected opensearch array shape.
    """
    term = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={term}&limit=3&format=json"
    body = http_get(endpoint)
    if not body:
        return []
    try:
        data = json.loads(body)
        titles, links = data[1], data[3]
        return list(zip(titles, links))  # (title, url)
    except Exception:
        return []
# Substrings (Croatian/English) that must appear in a directly-fetched page.
_DIRECT_SPORT_KEYWORDS = (
    "sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu",
    "ju-jitsu", "sahist", "šahis", "biciklist", "plivač",
    "plivat", "boxer", "olimpij", "bonifac", "athlete", "compete",
    "sportaš", "swimmer", "diver", "sailor", "wrestler",
)
# Substrings that must appear in a page found through the search fallback.
_SEARCH_SPORT_KEYWORDS = (
    "sport", "igrač", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "atletič",
    "ju-jitsu", "šahist", "biciklist", "plivač", "olimpij", "athlete",
    "compete", "sportaš", "swimmer", "diver", "sailor", "wrestler",
)


def _summary_text(summary):
    """Return the lower-cased description + extract (first 2000 chars) of a summary."""
    # `or ""` guards against keys that are present but null in the JSON.
    text = (summary.get("description") or "") + " " + (summary.get("extract") or "")
    return text[:2000].lower()


def _page_url(summary):
    """Return the desktop page URL of a summary, or None when absent."""
    return (summary.get("content_urls") or {}).get("desktop", {}).get("page")


def _find_summary(full, sport):
    """Return (summary, lang, url) of the first acceptable page, else (None, None, None)."""
    for lang in ("hr", "en"):
        # 1) Direct title lookup.
        s = wiki_summary(full, lang)
        if s and s.get("type") == "standard" and not s.get("disambiguation"):
            # Sanity check: the page must mention a sport-related keyword.
            if any(kw in _summary_text(s) for kw in _DIRECT_SPORT_KEYWORDS):
                return s, lang, _page_url(s)
        time.sleep(DELAY)
        # 2) Search fallback.
        for title, _url in wiki_search(full, lang):
            # Skip disambiguation pages.
            if "razdvojba" in title.lower() or "disambiguation" in title.lower():
                continue
            s2 = wiki_summary(title, lang)
            if not s2:
                continue
            desc = _summary_text(s2)
            # Needs a generic sport keyword AND the first five letters of
            # the member's own sport.
            if (any(kw in desc for kw in _SEARCH_SPORT_KEYWORDS)
                    and sport and sport.lower()[:5] in desc):
                return s2, lang, _page_url(s2)
            time.sleep(DELAY)
    return None, None, None


def main():
    """Enrich top athletes in pgz_sport.clanovi with Wikipedia summaries.

    Targets are members with kategorija_hoo 1-3 or with a clan_nagrada row at
    level SP/EP/OI/SK. Members whose biografija is already longer than 200
    characters are skipped. For the rest, Croatian and then English Wikipedia
    are tried (direct title, then opensearch). A hit overwrites biografija,
    fills source/source_url if empty, and adds a fact to dabi.knowledge.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        # Target list: HOO category 1-3 plus SP/EP/OI/SK medalists.
        cr.execute("""
            SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, c.biografija, k.naziv
            FROM pgz_sport.clanovi c
            LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE c.kategorija_hoo IN (1, 2, 3)
               OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                           WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK'))
            ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
        """)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")

        enriched = 0
        fact_count = 0
        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"
            summary, wiki_lang, wiki_url = _find_summary(full, sport)
            if not summary or not summary.get("extract"):
                continue
            extract = summary.get("extract", "").strip()[:1500]
            if len(extract) < 80:
                continue

            # Update clanovi.
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source = COALESCE(source, %s),
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, f"wikipedia_{wiki_lang}", wiki_url, tid))
                enriched += 1
            except Exception as e:
                print(f" ERR update {full}: {e}")
                continue

            # Insert as a fact in dabi.knowledge (best effort, but report failures).
            fact = f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {wiki_lang.upper()})"
            try:
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (fact[:2000], f"wikipedia_{wiki_lang}", 0.9, "biografija_sportasa"))
                if cr.rowcount:
                    fact_count += 1
            except Exception as e:
                print(f" ERR knowledge {full}: {e}")
            print(f" ✓ {full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)

        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")
        # Sample of the longest Wikipedia-sourced bios.
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
                      FROM pgz_sport.clanovi
                      WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
                        AND source LIKE 'wikipedia%'
                      ORDER BY bio_len DESC LIMIT 15""")
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f" {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+94
View File
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""D v2: simpler & faster wiki enrichment with stdout flush."""
import re, json, time, sys
import urllib.request, urllib.parse, urllib.error
import psycopg2
# Connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every HTTP request made by http_get().
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush so progress is visible immediately."""
    print(msg, file=sys.stdout, flush=True)
    sys.stdout.flush()
def http_get(url, timeout=10):
    """Fetch *url* and return the UTF-8 decoded body, or None on any failure.

    Undecodable bytes are replaced rather than raising.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* and return it as a dict.

    Returns None when the request fails or the body is not valid JSON.
    """
    slug = urllib.parse.quote(title.replace(" ", "_"))
    payload = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not payload:
        return None
    try:
        parsed = json.loads(payload)
    except Exception:
        return None
    return parsed
def main():
    """Enrich up to 100 top athletes with Wikipedia summaries (direct title match).

    Targets are members with kategorija_hoo 1-3 or an SP/EP/OI/SK award whose
    biografija is missing or shorter than 200 characters. For each, the
    Croatian and then the English page titled "<ime> <prezime>" is tried; the
    first one with a usable extract that mentions a sport keyword is written
    to pgz_sport.clanovi and added as a fact to dabi.knowledge.
    """
    # Built once: the keyword list does not change between iterations.
    sport_kws = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil",
                 "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "biciklist",
                 "plivat", "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer",
                 "diver", "skier", "sailor", "vesla", "ringa", "gimnast")

    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 100""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # Try a direct title match on HR, then EN.
            for lang in ("hr", "en"):
                s = wiki_summary(full, lang)
                if not s:
                    continue
                if s.get("type") not in ("standard", None):
                    continue
                extract = (s.get("extract") or "").strip()
                if not extract or len(extract) < 80:
                    continue
                # Quality check: the page must mention a sport keyword.
                t = (extract + " " + (s.get("description") or "")).lower()
                if not any(kw in t for kw in sport_kws):
                    continue
                wurl = (s.get("content_urls") or {}).get("desktop", {}).get("page")
                if not wurl:
                    continue

                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source = COALESCE(NULLIF(source, ''), %s),
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""",
                               (extract[:1500], f"wikipedia_{lang}", wurl, tid))
                except Exception as e:
                    out(f" ERR {full}: {e}")
                    break
                # With autocommit the UPDATE is already persisted, so count it
                # here, independently of the knowledge insert below.
                enriched += 1
                try:
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                except Exception as e:
                    out(f" ERR knowledge {full}: {e}")
                out(f" ✓ [{lang}] {full} - {len(extract)} chars")
                break  # found, no need to try the other language
            if tried % 20 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.3)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+94
View File
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
"""D v2: simpler & faster wiki enrichment with stdout flush."""
import re, json, time, sys
import urllib.request, urllib.parse, urllib.error
import psycopg2
# Connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every HTTP request made by http_get().
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
def out(msg):
    """Print *msg* on stdout and flush so progress is visible immediately."""
    print(msg, file=sys.stdout, flush=True)
    sys.stdout.flush()
def http_get(url, timeout=10):
    """Fetch *url* and return the UTF-8 decoded body, or None on any failure.

    Undecodable bytes are replaced rather than raising.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* and return it as a dict.

    Returns None when the request fails or the body is not valid JSON.
    """
    slug = urllib.parse.quote(title.replace(" ", "_"))
    payload = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not payload:
        return None
    try:
        parsed = json.loads(payload)
    except Exception:
        return None
    return parsed
def main():
    """Enrich up to 100 top athletes with Wikipedia summaries (direct title match).

    Targets are members with kategorija_hoo 1-3 or an SP/EP/OI/SK award whose
    biografija is missing or shorter than 200 characters. For each, the
    Croatian and then the English page titled "<ime> <prezime>" is tried; the
    first one with a usable extract that mentions a sport keyword is written
    to pgz_sport.clanovi and added as a fact to dabi.knowledge.
    """
    # Built once: the keyword list does not change between iterations.
    sport_kws = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil",
                 "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "biciklist",
                 "plivat", "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer",
                 "diver", "skier", "sailor", "vesla", "ringa", "gimnast")

    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 100""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # Try a direct title match on HR, then EN.
            for lang in ("hr", "en"):
                s = wiki_summary(full, lang)
                if not s:
                    continue
                if s.get("type") not in ("standard", None):
                    continue
                extract = (s.get("extract") or "").strip()
                if not extract or len(extract) < 80:
                    continue
                # Quality check: the page must mention a sport keyword.
                t = (extract + " " + (s.get("description") or "")).lower()
                if not any(kw in t for kw in sport_kws):
                    continue
                wurl = (s.get("content_urls") or {}).get("desktop", {}).get("page")
                if not wurl:
                    continue

                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source = COALESCE(NULLIF(source, ''), %s),
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""",
                               (extract[:1500], f"wikipedia_{lang}", wurl, tid))
                except Exception as e:
                    out(f" ERR {full}: {e}")
                    break
                # With autocommit the UPDATE is already persisted, so count it
                # here, independently of the knowledge insert below.
                enriched += 1
                try:
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                except Exception as e:
                    out(f" ERR knowledge {full}: {e}")
                out(f" ✓ [{lang}] {full} - {len(extract)} chars")
                break  # found, no need to try the other language
            if tried % 20 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.3)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+112
View File
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""D v3: enrichment via the Wikipedia search API (opensearch) plus REST page summaries; no DuckDuckGo fallback is implemented in this script."""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# Connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every HTTP request made by http_get().
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
def out(msg):
    """Print *msg* on stdout, flushing right away so progress shows in real time."""
    print(msg, end="\n", flush=True)
def http_get(url, timeout=10):
    """Fetch *url* and return the UTF-8 decoded body, or None on any failure.

    Undecodable bytes are replaced rather than raising.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_search(query, lang="hr"):
    """Run an opensearch query and return up to five (title, url) pairs.

    Returns an empty list when the request fails or the response cannot be
    unpacked into the expected opensearch array shape.
    """
    term = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={term}&limit=5&format=json"
    body = http_get(endpoint)
    if not body:
        return []
    try:
        data = json.loads(body)
        titles, links = data[1], data[3]
        return list(zip(titles, links))
    except Exception:
        return []
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* and return it as a dict.

    Returns None when the request fails or the body is not valid JSON.
    """
    slug = urllib.parse.quote(title.replace(" ", "_"))
    payload = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not payload:
        return None
    try:
        parsed = json.loads(payload)
    except Exception:
        return None
    return parsed
# Lower-case substrings (Croatian and English). main() accepts a Wikipedia
# page only if its extract/description contains at least one of them.
SPORT_KW = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil", "kuglač",
"boćar", "skij", "karat", "kickbox", "wushu", "biciklist", "plivat", "plivač",
"athlete", "compet", "wrestle", "swimmer", "boxer", "diver", "skier", "sailor",
"vesla", "gimnast", "rukomet", "košark", "tenisa", "šahist", "ribolov",
"ju-jitsu", "jujits", "automob", "rally", "racing", "freediv", "apnea", "free diving")
def main():
    """Enrich up to 150 top athletes with Wikipedia summaries found via search.

    Targets are members with kategorija_hoo 1-3 or an SP/EP/OI/SK award whose
    biografija is missing or shorter than 200 characters. For each, the top
    three opensearch results on HR and then EN Wikipedia are checked; a page
    is accepted when its text contains the member's surname and at least one
    SPORT_KW entry. The extract is written to pgz_sport.clanovi and added as
    a fact to dabi.knowledge.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 150""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # A NULL/empty surname cannot identify anyone; without this guard
            # prez.lower() would raise (NULL) or match every page (empty).
            surname = (prez or "").lower()
            found = False
            for lang in ("hr", "en"):
                results = wiki_search(full, lang) if surname else []
                for title, _url_link in results[:3]:
                    if "razdvojba" in title.lower() or "disambiguation" in title.lower():
                        continue
                    s = wiki_summary(title, lang)
                    if not s:
                        continue
                    if s.get("type") not in ("standard", None):
                        continue
                    extract = (s.get("extract") or "").strip()
                    if not extract or len(extract) < 80:
                        continue
                    # Match: must contain the surname AND a sport keyword.
                    tlower = (extract + " " + (s.get("description") or "")).lower()
                    if surname not in tlower:
                        continue  # not about the same person
                    if not any(kw in tlower for kw in SPORT_KW):
                        continue
                    wurl = (s.get("content_urls") or {}).get("desktop", {}).get("page")
                    if not wurl:
                        continue

                    try:
                        cr.execute("""UPDATE pgz_sport.clanovi
                                      SET biografija = %s,
                                          source_url = COALESCE(source_url, %s),
                                          source_synced_at = now()
                                      WHERE id = %s""",
                                   (extract[:1500], wurl, tid))
                    except Exception as e:
                        out(f" ERR {full}: {e}")
                        continue
                    # With autocommit the UPDATE is already persisted, so the
                    # member counts as enriched even if the insert below fails.
                    enriched += 1
                    found = True
                    try:
                        cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                      VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                                   (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                    f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    except Exception as e:
                        out(f" ERR knowledge {full}: {e}")
                    out(f" ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
                    break
                time.sleep(0.2)
                if found:
                    break
            if tried % 25 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.2)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+112
View File
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""D v3: enrichment via the Wikipedia search API (opensearch) plus REST page summaries; no DuckDuckGo fallback is implemented in this script."""
import re, json, time, sys
import urllib.request, urllib.parse
import psycopg2
# Connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every HTTP request made by http_get().
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
def out(msg):
    """Print *msg* on stdout, flushing right away so progress shows in real time."""
    print(msg, end="\n", flush=True)
def http_get(url, timeout=10):
    """Fetch *url* and return the UTF-8 decoded body, or None on any failure.

    Undecodable bytes are replaced rather than raising.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read()
        return body.decode("utf-8", errors="replace")
    except Exception:
        return None
def wiki_search(query, lang="hr"):
    """Run an opensearch query and return up to five (title, url) pairs.

    Returns an empty list when the request fails or the response cannot be
    unpacked into the expected opensearch array shape.
    """
    term = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={term}&limit=5&format=json"
    body = http_get(endpoint)
    if not body:
        return []
    try:
        data = json.loads(body)
        titles, links = data[1], data[3]
        return list(zip(titles, links))
    except Exception:
        return []
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST page summary for *title* and return it as a dict.

    Returns None when the request fails or the body is not valid JSON.
    """
    slug = urllib.parse.quote(title.replace(" ", "_"))
    payload = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{slug}")
    if not payload:
        return None
    try:
        parsed = json.loads(payload)
    except Exception:
        return None
    return parsed
# Lower-case substrings (Croatian and English). main() accepts a Wikipedia
# page only if its extract/description contains at least one of them.
SPORT_KW = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil", "kuglač",
"boćar", "skij", "karat", "kickbox", "wushu", "biciklist", "plivat", "plivač",
"athlete", "compet", "wrestle", "swimmer", "boxer", "diver", "skier", "sailor",
"vesla", "gimnast", "rukomet", "košark", "tenisa", "šahist", "ribolov",
"ju-jitsu", "jujits", "automob", "rally", "racing", "freediv", "apnea", "free diving")
def main():
    """Enrich up to 150 top athletes with Wikipedia summaries found via search.

    Targets are members with kategorija_hoo 1-3 or an SP/EP/OI/SK award whose
    biografija is missing or shorter than 200 characters. For each, the top
    three opensearch results on HR and then EN Wikipedia are checked; a page
    is accepted when its text contains the member's surname and at least one
    SPORT_KW entry. The extract is written to pgz_sport.clanovi and added as
    a fact to dabi.knowledge.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 150""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0
        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # A NULL/empty surname cannot identify anyone; without this guard
            # prez.lower() would raise (NULL) or match every page (empty).
            surname = (prez or "").lower()
            found = False
            for lang in ("hr", "en"):
                results = wiki_search(full, lang) if surname else []
                for title, _url_link in results[:3]:
                    if "razdvojba" in title.lower() or "disambiguation" in title.lower():
                        continue
                    s = wiki_summary(title, lang)
                    if not s:
                        continue
                    if s.get("type") not in ("standard", None):
                        continue
                    extract = (s.get("extract") or "").strip()
                    if not extract or len(extract) < 80:
                        continue
                    # Match: must contain the surname AND a sport keyword.
                    tlower = (extract + " " + (s.get("description") or "")).lower()
                    if surname not in tlower:
                        continue  # not about the same person
                    if not any(kw in tlower for kw in SPORT_KW):
                        continue
                    wurl = (s.get("content_urls") or {}).get("desktop", {}).get("page")
                    if not wurl:
                        continue

                    try:
                        cr.execute("""UPDATE pgz_sport.clanovi
                                      SET biografija = %s,
                                          source_url = COALESCE(source_url, %s),
                                          source_synced_at = now()
                                      WHERE id = %s""",
                                   (extract[:1500], wurl, tid))
                    except Exception as e:
                        out(f" ERR {full}: {e}")
                        continue
                    # With autocommit the UPDATE is already persisted, so the
                    # member counts as enriched even if the insert below fails.
                    enriched += 1
                    found = True
                    try:
                        cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                      VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                                   (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                    f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    except Exception as e:
                        out(f" ERR knowledge {full}: {e}")
                    out(f" ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
                    break
                time.sleep(0.2)
                if found:
                    break
            if tried % 25 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.2)
        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+67
View File
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# clan_oib_enricher.py — match clanovi against civic.persons by first name + last name (grad and datum_rodenja are selected but not used in the match)
import os, sys, logging, re
import psycopg2
from psycopg2.extras import RealDictCursor
# Root logger setup: INFO level, every line tagged with [clan_oib].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [clan_oib] %(message)s')
log = logging.getLogger("clan_oib")
# libpq connection string used by psycopg2.connect().
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
def _like_escape(value):
    """Escape LIKE metacharacters so *value* is matched literally."""
    return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def main():
    """Fill in missing OIBs on pgz_sport.clanovi from civic.persons by name.

    Up to 5000 members without an 11-character OIB (and with ime/prezime
    longer than two characters) are processed. A member receives an OIB only
    when exactly one civic.persons row with a valid OIB matches the pattern
    "%<ime>%<prezime>%" case-insensitively.

    NOTE(review): grad and datum_rodenja are selected but not used, so a
    match rests on the name alone — confirm this is acceptable for OIBs.
    """
    match_sql = """
        SELECT oib, name
        FROM civic.persons
        WHERE oib IS NOT NULL AND length(oib) = 11
          AND lower(name) LIKE %s
        LIMIT 5
    """
    conn = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    conn.autocommit = True
    try:
        cur = conn.cursor()
        cur.execute("""
            SELECT id, ime, prezime, grad, datum_rodenja
            FROM pgz_sport.clanovi
            WHERE (oib IS NULL OR length(oib) != 11)
              AND ime IS NOT NULL AND prezime IS NOT NULL
              AND length(ime) > 2 AND length(prezime) > 2
            ORDER BY id
            LIMIT 5000
        """)
        clanovi = cur.fetchall()
        log.info(f"Members for OIB enrichment: {len(clanovi)}")

        enriched = 0
        for c in clanovi:
            # The SQL compares against lower(name), so the pattern must be
            # lower-cased too; otherwise capitalised names never match.
            pattern = "%{}%{}%".format(
                _like_escape(c["ime"].lower()),
                _like_escape(c["prezime"].lower()),
            )
            cur.execute(match_sql, (pattern,))
            matches = cur.fetchall()
            if len(matches) != 1:
                continue  # no match, or ambiguous
            # Unique match — update (re-checking that the OIB is still missing).
            cur.execute("UPDATE pgz_sport.clanovi SET oib = %s WHERE id = %s AND (oib IS NULL OR length(oib) != 11)",
                        (matches[0]['oib'], c['id']))
            enriched += 1
            if enriched % 100 == 0:
                log.info(f"Progress: {enriched} enriched")

        log.info(f"FINAL: {enriched} clanovi enriched (unique matches only)")
        cur.close()
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+78
View File
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
# Seed cultural_qa training data iz dabi.knowledge (Alan Ford, čakavski, satrovacki, izreke)
import psycopg2, hashlib, logging
# Root logger setup: INFO level, every line tagged with [cult_qa].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa] %(message)s')
log = logging.getLogger("cult_qa")
# libpq connection string used by psycopg2.connect().
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# Alan Ford character names looked for (case-insensitively) in a fact.
_ALAN_FORD_CHARACTERS = ['Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
                         'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga']


def _seed_questions(fact, cat):
    """Return the question variants generated for one fact, chosen by category."""
    if 'alan_ford' in cat:
        fact_lower = fact.lower()
        # Use the first character mentioned in the fact, if any.
        for char in _ALAN_FORD_CHARACTERS:
            if char.lower() in fact_lower:
                return [f"Tko je {char}?", f"Što znaš o liku {char}?"]
        return ["Što znaš o Alan Fordu?"]
    is_satrovacki = 'satrovacki' in cat
    if is_satrovacki or 'cakavski' in cat or 'fjumanski' in cat:
        # The first word of the fact is taken as the term being defined.
        words = fact.split()
        if not words or len(words[0]) <= 2:
            return []
        term = words[0].rstrip('=,:;.').strip()
        if is_satrovacki:
            return [f"Što znači {term}?", f"Što je {term} na šatrovačkom?"]
        return [f"Što znači {term}?", f"Što je {term} u riječkom dijalektu?"]
    if 'rijeka_izreka' in cat:
        return ["Reci mi neku riječku izreku.", "Koje su tradicionalne riječke izreke?"]
    return []


def main():
    """Seed dabi.training_qa with cultural Q&A pairs derived from dabi.knowledge.

    Reads up to 1000 facts from the listed cultural categories, generates at
    most two questions per fact, and inserts each (question, fact) pair with
    source_type 'cultural_seed'. Conflicting rows are skipped by the database.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor()
        # Fetch the cultural facts.
        cur.execute("""
            SELECT id, fact, category FROM dabi.knowledge
            WHERE category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                               'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                               'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                               'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik')
              AND fact IS NOT NULL AND length(fact) > 30
            LIMIT 1000
        """)
        rows = cur.fetchall()
        log.info(f"Cultural facts: {len(rows)}")

        inserted = 0
        errors = 0
        for _fid, fact, cat in rows:
            # Save Q&A pairs, max 2 per fact.
            for q in _seed_questions(fact, cat)[:2]:
                try:
                    cur.execute("""
                        INSERT INTO dabi.training_qa
                        (question, answer, category, source_type, created_at)
                        VALUES (%s, %s, %s, 'cultural_seed', now())
                        ON CONFLICT DO NOTHING
                    """, (q[:300], fact[:500], 'cultural_'+cat.split('_')[0]))
                    inserted += cur.rowcount
                except Exception as e:
                    errors += 1
                    # Log only the first few in full to avoid flooding the log.
                    if errors <= 3:
                        log.warning(f"insert err: {e}")
        if errors:
            log.warning(f"insert errors: {errors}")
        log.info(f"Inserted: {inserted} cultural Q&A pairs")
        cur.close()
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+89
View File
@@ -0,0 +1,89 @@
#!/usr/bin/env python3
# Extended cultural Q&A seed (each fact yields up to 3 question variants)
import psycopg2, hashlib, logging, re
# Root logger setup: INFO level, every line tagged with [cult_qa2].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa2] %(message)s')
# libpq connection string used by psycopg2.connect().
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# Alan Ford character names looked for (case-insensitively) in a fact.
_CHARACTERS = ['Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija', 'Šef',
               'Margot', 'Superhik', 'Notar', 'Cifra Sluga', 'Don Galon', 'Debela Gilda']
# Category fragments that mark a dialect/slang fact.
_DIALECT_MARKERS = ['satrovacki', 'cakavski', 'fjumanski', 'lokalni']
# Words never treated as the defined term.
_SKIP_WORDS = ['ova', 'taj', 'jest', 'znači', 'što', 'kako', 'tko', 'gdje']
# Fallback question per category fragment, checked in this order.
_CATEGORY_FALLBACK = {
    'alan_ford': 'Pričaj mi nešto o Alan Fordu.',
    'cakavski': 'Pričaj mi o čakavskom dijalektu.',
    'satrovacki': 'Što je šatrovački?',
    'fjumanski': 'Što je fjumanski?',
    'rijeka': 'Što je posebno za Rijeku?'
}


def _build_questions(fact, cat):
    """Return every question variant generated for one fact (uncapped)."""
    questions = []
    fl = fact.lower()
    cat_lower = cat.lower()

    # Alan Ford characters: first one mentioned wins.
    for ch in _CHARACTERS:
        if ch.lower() in fl:
            questions.extend([f"Tko je {ch}?", f"Što znaš o {ch}?", f"Kakav je lik {ch}?"])
            break

    # Dialect/slang facts: first suitable word among the first three is the term.
    if any(k in cat_lower for k in _DIALECT_MARKERS):
        for w in re.findall(r'\b\w+\b', fact)[:3]:
            if len(w) >= 3 and w.lower() not in _SKIP_WORDS:
                questions.append(f"Što znači riječ {w}?")
                questions.append(f"Što je {w}?")
                break

    # Rijeka sayings.
    if 'izrek' in cat_lower or 'izrek' in fl:
        questions.append("Reci mi neku riječku izreku.")
        questions.append("Imaš li primjer riječke poslovice?")

    # General Rijeka context.
    if 'rijeka' in fl or 'kvarner' in fl or 'trsat' in fl or 'preluk' in fl:
        questions.append("Što mi možeš reći o Rijeci?")

    if not questions:
        # Fallback question based on the category.
        for k, v in _CATEGORY_FALLBACK.items():
            if k in cat_lower:
                questions.append(v)
                break
    return questions


def main():
    """Seed dabi.training_qa with an extended set of cultural Q&A pairs.

    Selects up to 2000 facts from dabi.knowledge (by category, or by a regex
    on the fact text), generates question variants per fact, and inserts at
    most three (question, fact) pairs each with source_type
    'cultural_seed_v2'. Conflicting rows are skipped by the database.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor()
        # Local Rijeka + dialect facts.
        cur.execute("""
            SELECT id, fact, category FROM dabi.knowledge
            WHERE (category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                                'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                                'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                                'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik',
                                'pgz_administracija','pgz_promet','rijeka_lokali','rijeka_lokal')
                   OR fact ~ '\\m(žišku|brodo|rista|vopi|kantun|ponistra|šugaman)\\M'
                   OR fact ~ '\\m(Alan Ford|Bob Rock|Sir Oliver|TNT|Grunf)\\M')
              AND fact IS NOT NULL AND length(fact) > 30 AND length(fact) < 1500
            LIMIT 2000
        """)
        rows = cur.fetchall()
        logging.info(f"Cultural facts proširen: {len(rows)}")

        inserted = 0
        errors = 0
        for _fid, fact, cat in rows:
            # Rows selected through the fact regex may carry no category.
            cat = cat or ''
            for q in _build_questions(fact, cat)[:3]:
                try:
                    cur.execute("""
                        INSERT INTO dabi.training_qa
                        (question, answer, category, source_type, created_at)
                        VALUES (%s, %s, %s, 'cultural_seed_v2', now())
                        ON CONFLICT DO NOTHING
                    """, (q[:300], fact[:800], 'cultural_'+cat.split('_')[0][:20]))
                    inserted += cur.rowcount
                except Exception as e:
                    # Still best-effort, but no longer silent.
                    errors += 1
                    if errors <= 3:
                        logging.warning(f"insert err: {e}")
        if errors:
            logging.warning(f"insert errors: {errors}")
        logging.info(f"Inserted: {inserted} cultural Q&A v2")
        cur.close()
    finally:
        # Release the connection even when a query raises.
        conn.close()


if __name__ == "__main__":
    main()
+53
View File
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
# Self-quiz loop — DABI gets randomized PGŽ sport questions every 5min
import psycopg2, requests, time, random, hashlib, logging
# Root logger setup: INFO level, every line tagged with [self_quiz].
logging.basicConfig(level=logging.INFO, format='%(asctime)s [self_quiz] %(message)s')
log = logging.getLogger("self_quiz")
# libpq connection string used by psycopg2.connect().
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# Endpoint that main() POSTs each quiz question to.
ORCH = "http://localhost:8080/api/v3/ask"
def _log_quiz_result(question, expected, actual):
    """Store one quiz result in dabi.system_log for later evaluation."""
    import json  # local import: the module's import line does not include json

    # json.dumps quotes and escapes the values, so the ::jsonb cast succeeds.
    metadata = json.dumps(
        {"expected": (expected or '')[:200], "actual": (actual or '')[:200]},
        ensure_ascii=False,
    )
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        cur = conn.cursor()
        cur.execute("""
            INSERT INTO dabi.system_log (event_type, message, metadata, created_at)
            VALUES ('self_quiz', %s, %s::jsonb, now())
        """, (question[:200], metadata))
        cur.close()
    except Exception as e:
        # Logging the result is best-effort; report instead of hiding it.
        log.warning(f"Quiz log fail: {e}")
    finally:
        conn.close()


def main():
    """Run the self-quiz loop forever.

    Each cycle draws 20 random pgz_sport_* rows from dabi.training_qa, POSTs
    every question to ORCH (3 s apart), and records the expected and actual
    answers in dabi.system_log. Cycles are 300 s apart; an error at cycle
    level is logged and retried after 60 s.
    """
    while True:
        try:
            conn = psycopg2.connect(DSN)
            conn.autocommit = True
            try:
                cur = conn.cursor()
                cur.execute("""
                    SELECT question, answer FROM dabi.training_qa
                    WHERE category LIKE 'pgz_sport_%'
                    ORDER BY random() LIMIT 20
                """)
                qa_pairs = cur.fetchall()
                cur.close()
            finally:
                conn.close()

            for q, expected_a in qa_pairs:
                try:
                    r = requests.post(ORCH, json={"question": q, "persona": "app"}, timeout=15)
                    if r.status_code == 200:
                        d = r.json()
                        # A null answer is treated like a missing one.
                        actual = d.get('answer', '') or ''
                        # Log to dabi.system_log for evaluation.
                        _log_quiz_result(q, expected_a, actual)
                        log.info(f"Q: {q[:60]}... A: {actual[:80]}")
                except Exception as e:
                    log.warning(f"Quiz fail: {e}")
                time.sleep(3)

            log.info(f"Cycle done, sleep 300s")
            time.sleep(300)
        except Exception as e:
            log.error(f"Loop error: {e}")
            time.sleep(60)


if __name__ == "__main__":
    main()
+147
View File
@@ -0,0 +1,147 @@
#!/usr/bin/env python3
"""Embed dokumenti into Qdrant pgz_sport_dokumenti_v1 collection.
Strategy:
1. Use existing sadrzaj for docs that have content scraped
2. For docs without sadrzaj — embed kratak_opis + naslov + organizacija
3. Chunk into 800-char overlapping windows
4. BGE-M3 embed via local server
5. Store in Qdrant + dokument_chunks
"""
import psycopg2
import psycopg2.extras
import requests
import json
import re
import sys
from datetime import datetime
# Connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hardcoded in source — consider
# loading it from the environment or a secret store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# Embedding endpoint called by embed_text().
EMBED_URL = 'http://localhost:9879/api/embeddings' # BGE-M3
# Base URL of the Qdrant HTTP API.
QDRANT = 'http://10.10.0.2:6333'
# Name of the Qdrant collection that receives the document chunks.
COLL = 'pgz_sport_dokumenti_v1'
# Vector size the collection is created with.
DIM = 1024
# Default window length (characters) and window overlap used by chunk_text().
CHUNK_SIZE = 800
OVERLAP = 100
def ensure_collection():
    """Create the Qdrant collection COLL (DIM-sized cosine vectors) if missing.

    Does nothing when the collection endpoint already answers 200.

    Raises:
        requests.HTTPError: if Qdrant rejects the creation request.
    """
    r = requests.get(f'{QDRANT}/collections/{COLL}', timeout=10)
    if r.status_code == 200:
        return
    created = requests.put(f'{QDRANT}/collections/{COLL}', json={
        "vectors": {"size": DIM, "distance": "Cosine"}
    }, timeout=30)
    # Only report success when Qdrant actually accepted the request.
    created.raise_for_status()
    print(f" ✓ Created collection {COLL}")
def embed_text(text):
    """Return the BGE-M3 embedding vector for *text*, or None if none is returned.

    Two response shapes are understood: a top-level "embedding" key, or a
    "data" list whose first item carries "embedding".

    Raises:
        requests.HTTPError: if the embedding server answers with an error status.
    """
    r = requests.post(EMBED_URL, json={"model":"bge-m3","prompt":text}, timeout=30)
    r.raise_for_status()
    body = r.json()  # parse once instead of once per lookup
    vec = body.get('embedding')
    if vec:
        return vec
    # `or [{}]` also covers an empty "data" list, which would otherwise
    # raise IndexError on [0].
    data = body.get('data') or [{}]
    return data[0].get('embedding')
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into windows of at most *size* characters.

    Whitespace runs are collapsed to single spaces first. Consecutive windows
    start ``size - overlap`` characters apart, so neighbours share *overlap*
    characters. Splitting stops as soon as a window reaches the end of the
    text, so no trailing chunk made purely of already-covered overlap is
    produced.

    Returns:
        A list of chunk strings; empty for empty or whitespace-only input.

    Raises:
        ValueError: if the text needs splitting and size <= 0 or
            overlap >= size (the window would never advance).
    """
    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []
    if len(text) <= size:
        return [text]
    step = size - overlap
    if size <= 0 or step <= 0:
        raise ValueError(f"size must be > 0 and overlap < size (got size={size}, overlap={overlap})")
    chunks = []
    start = 0
    total = len(text)
    while start < total:
        chunks.append(text[start:start + size])
        if start + size >= total:
            break  # this window already reaches the end of the text
        start += step
    return chunks
def main():
    """Rebuild all document embeddings in Qdrant and pgz_sport.dokument_chunks.

    Every active row of pgz_sport.dokumenti is embedded: documents whose
    sadrzaj is longer than 200 characters are chunked, the others are embedded
    as a single "title / kratak_opis / kljucne_rijeci" text. Each chunk is
    prefixed with a metadata header. The chunk table is truncated and the
    Qdrant collection dropped and recreated first, so this is a full refresh.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        ensure_collection()

        # Get all docs.
        cu.execute("""SELECT id, title, sadrzaj, kratak_opis, vrsta, razina, organizacija,
                             sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci
                      FROM pgz_sport.dokumenti WHERE COALESCE(aktivan,true)=true""")
        rows = cu.fetchall()
        print(f"Embedding {len(rows)} dokumenata…")

        # Clear existing chunks, then drop and recreate the collection. (The
        # earlier filtered point-delete was redundant with the drop.)
        cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")
        requests.delete(f'{QDRANT}/collections/{COLL}', timeout=30).raise_for_status()
        ensure_collection()

        point_id = 1
        n_emb = 0
        for d in rows:
            # Build the text to embed.
            title = (d.get('title') or '').strip()
            opis = (d.get('kratak_opis') or '').strip()
            sadrzaj = (d.get('sadrzaj') or '').strip()
            org = d.get('organizacija') or ''
            razina = d.get('razina') or ''
            vrsta = d.get('vrsta') or ''
            sport = d.get('sport') or ''
            kljuc = ', '.join(d.get('kljucne_rijeci') or [])
            glasnik = d.get('sluzbeni_glasnik') or ''

            # Header injected into every chunk for context.
            header = f"[{vrsta.upper()} · {razina} · {org}]\n"
            if sport:
                header += f"Sport: {sport}\n"
            if glasnik:
                header += f"Službeni glasnik: {glasnik}\n"

            # Strategy: if sadrzaj > 200 chars, chunk it; else use title + summary + keywords.
            if sadrzaj and len(sadrzaj) > 200:
                chunks = chunk_text(sadrzaj)
            else:
                text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
                chunks = [text_for_embed] if text_for_embed else []
            if not chunks:
                continue

            for idx, chunk in enumerate(chunks):
                full_chunk = header + chunk[:CHUNK_SIZE]
                try:
                    vec = embed_text(full_chunk)
                    if not vec:
                        continue
                    payload = {
                        "dokument_id": d['id'],
                        "chunk_index": idx,
                        "title": title[:200],
                        "vrsta": vrsta,
                        "razina": razina,
                        "organizacija": org,
                        "sport": sport,
                        "sluzbeni_glasnik": glasnik,
                        "izvor_url": d.get('izvor_url') or '',
                        "preview": chunk[:200],
                    }
                    # Upsert into Qdrant first and verify it. If it fails, no
                    # DB row is written and point_id stays free for the next
                    # chunk, keeping the two stores in step.
                    resp = requests.put(f'{QDRANT}/collections/{COLL}/points',
                                        json={"points":[{"id": point_id, "vector": vec, "payload": payload}]},
                                        timeout=10)
                    resp.raise_for_status()
                    # Save the chunk to the DB.
                    cu.execute("""INSERT INTO pgz_sport.dokument_chunks
                                  (dokument_id, chunk_index, chunk_text, chunk_tokens, embedded_at, qdrant_point_id)
                                  VALUES (%s, %s, %s, %s, now(), %s)""",
                               (d['id'], idx, full_chunk, len(full_chunk.split()), point_id))
                    point_id += 1
                    n_emb += 1
                    if n_emb % 25 == 0:
                        print(f" embedded {n_emb} chunks…")
                except Exception as e:
                    print(f" err doc {d['id']} chunk {idx}: {e}")

        # Final counts.
        qstats = requests.get(f'{QDRANT}/collections/{COLL}', timeout=10).json()
        points_count = qstats.get('result',{}).get('points_count',0)
        print(f"\n✓ Embedded {n_emb} chunks total")
        print(f" Qdrant {COLL}: {points_count} points")
        cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
        print(f" DB chunks: {cu.fetchone()['n']}")
    finally:
        # Release the connection even when a step raises.
        conn.close()


if __name__ == '__main__':
    main()
+162
View File
@@ -0,0 +1,162 @@
#!/usr/bin/env python3
"""
PGŽ Sport — Qdrant embedder.
Embeds savezi, klubovi and sportaši (clanovi) with BGE-M3 and stores the vectors in Qdrant (natjecanja are not embedded yet).
Collection: pgz_sport_v1 (1024 dim, BGE-M3)
Run modes:
python embedder.py init # create Qdrant collection
python embedder.py savezi # embed all savezi
python embedder.py klubovi # embed all klubovi
python embedder.py sportasi # embed all clanovi
python embedder.py all # full refresh
python embedder.py incremental # only items missing or stale (planned, NOT implemented: currently rejected as unknown)
"""
import os, sys, time, json, hashlib, logging
import psycopg2, psycopg2.extras, requests
# Connection settings. Every endpoint/credential can be overridden through the
# environment; the defaults are the previously hard-coded values, so existing
# deployments keep working unchanged.
# NOTE(security): the default password is committed to the repository. Set
# PGZ_DB_PASSWORD in the environment and rotate the credential.
DB = dict(
    host=os.environ.get('PGZ_DB_HOST', 'localhost'),
    port=int(os.environ.get('PGZ_DB_PORT', '5432')),
    dbname=os.environ.get('PGZ_DB_NAME', 'rinet_v3'),
    user=os.environ.get('PGZ_DB_USER', 'rinet'),
    password=os.environ.get('PGZ_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
)
# HTTP endpoint of the embedding service (see embed_batch)
EMBED = os.environ.get('PGZ_EMBED_URL', "http://localhost:9879/api/embeddings")
# Base URL of the Qdrant REST API
QDRANT = os.environ.get('PGZ_QDRANT_URL', "http://10.10.0.2:6333")
COLLECTION = "pgz_sport_v1"
DIM = 1024   # vector size the collection is created with
BATCH = 16   # number of texts sent to the embedding service per request
# Root logging: INFO and above goes to the log file first, then to stdout.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('/opt/pgz-sport/_logs/embedder.log'),
        logging.StreamHandler(sys.stdout),
    ],
)
# Logger used by every command in this module
log = logging.getLogger("emb")
def conn():
    """Open and return a new PostgreSQL connection built from the ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
def embed_batch(texts: list) -> list:
    """Embed a batch of texts through the HTTP embedding service at ``EMBED``.

    Two response shapes are understood: ``{"data": [{"embedding": [...]}, ...]}``
    and ``{"embeddings": [[...], ...]}``.

    Args:
        texts: Strings to embed; sent as the ``input`` field of the request.

    Returns:
        One embedding per input text, in the order the service returned them.
        An empty ``texts`` yields ``[]`` without contacting the service.

    Raises:
        requests.HTTPError: The service answered with an error status.
        RuntimeError: The response shape is unknown, or the number of
            embeddings differs from the number of inputs.
    """
    if not texts:
        return []
    r = requests.post(EMBED, json={"input": texts}, timeout=120)
    r.raise_for_status()
    d = r.json()
    if 'data' in d:
        vecs = [item['embedding'] for item in d['data']]
    elif 'embeddings' in d:
        vecs = d['embeddings']
    else:
        raise RuntimeError(f"unknown embed response shape: {list(d.keys())[:5]}")
    # Callers zip() their rows with this result; a short response would
    # otherwise drop rows silently while the log still reports them as saved.
    if len(vecs) != len(texts):
        raise RuntimeError(
            f"embed service returned {len(vecs)} embeddings for {len(texts)} inputs")
    return vecs
def cmd_init():
    """Create the Qdrant collection ``COLLECTION`` if it does not exist yet.

    The collection is created with ``DIM``-sized vectors and cosine distance.
    Only a 404 from the lookup is treated as "collection missing"; any other
    error status (auth failure, server error) is raised instead of triggering
    a create attempt.

    Raises:
        requests.HTTPError: The lookup or the create request failed.
        requests.RequestException: Qdrant was unreachable or timed out.
    """
    # Explicit timeouts: requests waits indefinitely by default
    r = requests.get(f"{QDRANT}/collections/{COLLECTION}", timeout=30)
    if r.status_code == 200:
        log.info(f"Collection {COLLECTION} already exists")
        return
    if r.status_code != 404:
        r.raise_for_status()
    r = requests.put(f"{QDRANT}/collections/{COLLECTION}", json={
        "vectors": {"size": DIM, "distance": "Cosine"},
        "optimizers_config": {"indexing_threshold": 10000},
    }, timeout=30)
    r.raise_for_status()
    log.info(f"Collection {COLLECTION} created")
def text_id(prefix: str, src_id: int) -> int:
    """Derive a deterministic numeric point ID from ``prefix`` and ``src_id``.

    The string ``"<prefix>:<src_id>"`` is SHA-1 hashed, the first 8 digest
    bytes are read as a big-endian integer, and the lowest bit is shifted out
    so the result is always below 2**63.
    """
    key = f"{prefix}:{src_id}".encode()
    digest = hashlib.sha1(key).digest()
    top64 = int.from_bytes(digest[:8], byteorder='big')
    return top64 >> 1
def upsert_points(points: list):
    """Upsert ``points`` into the Qdrant collection with a single PUT request.

    An empty (or falsy) ``points`` is a no-op. A failed request is logged with
    its status code and the first 300 characters of the response body, then
    raised via ``raise_for_status``.
    """
    if points:
        resp = requests.put(f"{QDRANT}/collections/{COLLECTION}/points",
                            json={"points": points}, timeout=120)
        if not resp.ok:
            log.error(f"qdrant upsert failed: {resp.status_code} {resp.text[:300]}")
        resp.raise_for_status()
def cmd_savezi():
    """Embed every active row of ``pgz_sport.savezi`` and upsert it into Qdrant.

    Ensures the collection exists, loads the rows, embeds them ``BATCH`` at a
    time and flushes the accumulated points to Qdrant whenever at least 64 are
    pending (plus a final flush).
    """
    cmd_init()
    c = conn()
    try:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("""SELECT id, naziv, sport, predsjednik, tajnik, web, aktivan, napomena
                      FROM pgz_sport.savezi WHERE aktivan=true""")
        rows = cu.fetchall()
    finally:
        # psycopg2's `with connection:` only ends the transaction; it does not
        # close the connection, so close it explicitly.
        c.close()
    log.info(f"Embedding {len(rows)} savezi…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski savez PGŽ: {r['naziv']}. Sport: {r['sport'] or ''}. "
                 f"Predsjednik: {r['predsjednik'] or '?'}. Tajnik: {r['tajnik'] or '?'}. "
                 f"{r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('savez', r['id']), "vector": v,
                        "payload": {"type":"savez","id":r['id'],"naziv":r['naziv'],
                                    "sport":r['sport'],"predsjednik":r['predsjednik'],
                                    "tajnik":r['tajnik'],"web":r['web']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} savezi → {COLLECTION}")
def cmd_klubovi():
    """Embed every active row of ``pgz_sport.klubovi`` and upsert it into Qdrant.

    Each club is joined with its savez (for the savez name used in the embedded
    text), embedded ``BATCH`` at a time, and flushed to Qdrant whenever at
    least 64 points are pending (plus a final flush).
    """
    cmd_init()
    c = conn()
    try:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("""SELECT k.id, k.naziv, k.sport, k.razina, k.grad, k.region,
                      k.predsjednik, k.tajnik, k.napomena, k.hns_klub_id, s.naziv AS savez
                      FROM pgz_sport.klubovi k LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id
                      WHERE k.aktivan=true""")
        rows = cu.fetchall()
    finally:
        # psycopg2's `with connection:` only ends the transaction; it does not
        # close the connection, so close it explicitly.
        c.close()
    log.info(f"Embedding {len(rows)} klubova…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski klub PGŽ: {r['naziv']}. Sport: {r['sport'] or ''} ({r['razina'] or 'liga ?'}). "
                 f"Grad: {r['grad'] or '?'} ({r['region'] or 'PGŽ'}). "
                 f"Savez: {r['savez'] or '?'}. Predsjednik: {r['predsjednik'] or '?'}. "
                 f"Tajnik: {r['tajnik'] or '?'}. {r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('klub', r['id']), "vector": v,
                        "payload": {"type":"klub","id":r['id'],"naziv":r['naziv'],
                                    "sport":r['sport'],"razina":r['razina'],"grad":r['grad'],
                                    "region":r['region'],"hns_klub_id":r['hns_klub_id']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} klubova → {COLLECTION}")
def cmd_sportasi():
    """Embed every row of ``pgz_sport.clanovi`` and upsert it into Qdrant.

    Each athlete is joined with their club (for club name and sport), embedded
    ``BATCH`` at a time, and flushed to Qdrant whenever at least 64 points are
    pending (plus a final flush).
    """
    cmd_init()
    c = conn()
    try:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("""SELECT c.id, c.ime, c.prezime, c.datum_rodenja, c.mjesto_rodenja,
                      c.pozicija, c.broj_dresa, c.reprezentativac, c.source,
                      k.naziv AS klub_naziv, k.sport
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id=c.klub_id""")
        rows = cu.fetchall()
    finally:
        # psycopg2's `with connection:` only ends the transaction; it does not
        # close the connection, so close it explicitly.
        c.close()
    log.info(f"Embedding {len(rows)} sportaša…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportaš: {r['ime'] or ''} {r['prezime'] or ''}. "
                 f"Klub: {r['klub_naziv'] or '?'}. Sport: {r['sport'] or '?'}. "
                 f"Datum rođenja: {r['datum_rodenja'] or '?'}. Mjesto: {r['mjesto_rodenja'] or '?'}. "
                 f"Pozicija: {r['pozicija'] or '?'}. "
                 f"{'Reprezentativac.' if r['reprezentativac'] else ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('sportas', r['id']), "vector": v,
                        "payload": {"type":"sportas","id":r['id'],
                                    "ime":r['ime'],"prezime":r['prezime'],
                                    "klub_naziv":r['klub_naziv'],"sport":r['sport'],
                                    "source":r['source']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    upsert_points(pts)
    log.info(f"Saved {len(rows)} sportaša → {COLLECTION}")
if __name__ == '__main__':
    # No sub-command: print the usage text (module docstring) and exit with 1
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    # Sub-command → functions to run, in order
    handlers = {
        'init': (cmd_init,),
        'savezi': (cmd_savezi,),
        'klubovi': (cmd_klubovi,),
        'sportasi': (cmd_sportasi,),
        'all': (cmd_savezi, cmd_klubovi, cmd_sportasi),
    }
    steps = handlers.get(cmd)
    if steps is None:
        # Unrecognised sub-command: report it and exit with 2
        print(f"unknown: {cmd}")
        sys.exit(2)
    for step in steps:
        step()
+454
View File
@@ -0,0 +1,454 @@
#!/usr/bin/env python3
"""Seed key Croatian sport law summaries with article-level detail.
This is expert knowledge — captures the legally relevant clauses that
RAG search needs to answer practical questions."""
import psycopg2
# PostgreSQL connection parameters passed to psycopg2.connect(**DB).
# NOTE(review): the password is hard-coded and committed with the source;
# prefer loading it from the environment or a secrets store, and rotate it.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
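# Each EXPERT_DOCS entry is a dict with the keys: title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci (list of str), izdano_datum (YYYY-MM-DD), sadrzaj (full text). `sport` and `sluzbeni_glasnik` may be None.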
EXPERT_DOCS = [
{
'title': 'Zakon o sportu (NN 141/22, 122/24)',
'kratak_opis': 'Temeljni zakon koji uređuje sport u RH — pravne osobe u sportu, sportaši, savezi, financiranje, kategorizacija, registar.',
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH', 'sport': None,
'sluzbeni_glasnik': 'NN 141/22, 122/24',
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
'kljucne_rijeci': ['sport','sportaš','klub','savez','financiranje','kategorizacija','licenciranje','registar'],
'izdano_datum': '2022-12-02',
'sadrzaj': '''ZAKON O SPORTU (NN 141/22, izmjene NN 122/24)
OPĆE ODREDBE
Ovaj Zakon uređuje sustav sporta, sportske djelatnosti, financiranje sporta, sportske građevine, te druga pitanja od značaja za sport u Republici Hrvatskoj.
Sportske djelatnosti su: 1) sudjelovanje u sportskom natjecanju, 2) sportska priprema, 3) sportska poduka, 4) sportska rekreacija, 5) organiziranje sportskog natjecanja i upravljanje sportskim natjecanjem, 6) upravljanje i održavanje sportske građevine.
PRAVNE OSOBE U SPORTU
Pravne osobe u sustavu sporta su: sportski klubovi (udruge ili sportska dionička društva), sportski savezi (županijski, gradski, općinski, nacionalni), sportske zajednice, druge sportske organizacije.
Sportski klub je osnovni nositelj sportskih djelatnosti. Može biti: a) sportska udruga, b) sportsko dioničko društvo (š.d.d.), c) profesionalni sportski klub.
OBVEZE SPORTSKOG KLUBA:
- Mora biti upisan u Registar udruga RH ili Sudski registar (š.d.d.)
- Mora biti upisan u Evidenciju pravnih osoba u sustavu sporta (vodi MTS)
- Mora imati statut, tijela uprave, izabranog predsjednika
- Vodi evidenciju o članstvu sportaša
- Pridržava se anti-doping propisa
- Osigurava liječnički nadzor sportaša
- Pridržava se pravila o zaštiti djece u sportu
SPORTAŠ
Sportaš je fizička osoba koja se bavi sportom. Status: 1) amaterski sportaš (bez naknade), 2) profesionalni sportaš (na temelju ugovora o radu ili ugovora o profesionalnom obavljanju sportskih djelatnosti).
Vrhunski sportaš ima posebne statuse i prava (kategorija I-V prema HOO kategorizaciji).
Maloljetni sportaš (do 16 godina): roditelj/skrbnik daje suglasnost. Profesionalni ugovor s osobom mlađom od 16 godina nije dopušten.
NACIONALNI SPORTSKI SAVEZ
Predstavlja konkretni sport pred međunarodnim federacijama, organizira državna prvenstva, donosi pravilnik o registraciji sportaša, izdaje licencije.
REGISTAR SPORTSKIH UDRUGA I D.D.
Vodi MTS (Ministarstvo turizma i sporta). Javan, dostupan na mtus.gov.hr.
FINANCIRANJE SPORTA
Sport se financira iz: državnog proračuna, proračuna JLS i JPRS, vlastitih sredstava, donacija, sponzorstava.
JAVNE POTREBE U SPORTU (JPS)
JLS (gradovi, općine) i JPRS (županije) donose godišnje programe javnih potreba u sportu (JPS) i raspoređuju sredstva.
KATEGORIZACIJA SPORTAŠA (HOO sustav)
Kategorija I — vrhunski svjetski sportaš (medalja na OI/SP)
Kategorija II — vrhunski međunarodni sportaš
Kategorija III — vrhunski državni sportaš
Kategorija IV — vrhunski mladi sportaš
Kategorija V — perspektivni sportaš
ANTI-DOPING
Svi sportaši pod jurisdikcijom HADA-e/HASMS. Obvezno testiranje. Sankcije za pozitivni nalaz: privremena/trajna suspenzija.
SIGURNOST NA NATJECANJIMA
Organizator natjecanja odgovara za sigurnost. Obvezno: redarski službenici, video nadzor (od određenog kapaciteta), suradnja s policijom.
SPORTSKE GRAĐEVINE
JLS i JPRS održavaju sportske građevine. Standard: pristupačnost, sigurnost, tehnička ispravnost.
PREKRŠAJI I SANKCIJE
Glob od 5.000 do 50.000 EUR za pravnu osobu koja: ne vodi evidenciju, ne pridržava se anti-doping propisa, ne osigurava liječnički nadzor, krši zaštitu djece.''',
},
{
'title': 'Zakon o udrugama (NN 74/14, 70/17, 98/19, 151/22)',
'kratak_opis': 'Uređuje osnivanje, registraciju, djelovanje i prestanak udruga — primjenjuje se na sve sportske klubove osnovane kao udruge.',
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH',
'sport': None,
'sluzbeni_glasnik': 'NN 74/14, 70/17, 98/19, 151/22',
'izvor_url': 'https://www.zakon.hr/z/64/Zakon-o-udrugama',
'kljucne_rijeci': ['udruga','registracija','statut','tijela','financiranje'],
'izdano_datum': '2014-06-18',
'sadrzaj': '''ZAKON O UDRUGAMA (NN 74/14, 70/17, 98/19, 151/22)
UDRUGA je svaki oblik slobodnog i dobrovoljnog udruživanja više fizičkih, odnosno pravnih osoba koje se, radi zaštite njihovih probitaka ili zauzimanja za zaštitu ljudskih prava i sloboda, ekološka, humanitarna, informacijska, kulturna, nacionalna, pronatalitetna, prosvjetna, socijalna, strukovna, sportska, tehnička, zdravstvena, znanstvena ili druga uvjerenja i ciljeve, a bez namjere stjecanja dobiti ili drugih gospodarski procjenjivih koristi, podvrgavaju pravilima koja uređuju ustroj i djelovanje toga oblika udruživanja.
OSNIVANJE UDRUGE: Najmanje 3 osnivača (fizičke ili pravne osobe). Donose Statut. Biraju zastupnike (predsjednika, tajnika i sl.).
REGISTRACIJA: Upis u Registar udruga RH (vodi nadležno tijelo). Stječe pravnu osobnost danom upisa.
STATUT UDRUGE — obvezan sadržaj:
1) naziv i sjedište
2) zastupanje
3) područja djelovanja sukladno ciljevima
4) ciljevi (npr. razvoj sporta)
5) djelatnosti kojima se ostvaruju ciljevi
6) gospodarske djelatnosti (ako se obavljaju, npr. ulaznice)
7) članstvo (uvjeti, prava, obveze)
8) tijela udruge (Skupština, predsjednik, izvršni odbor)
9) izborna razdoblja
10) imovina i raspolaganje
11) postupak likvidacije
TIJELA UDRUGE:
- Skupština (najviše tijelo, sastavljena od svih članova)
- Predsjednik (zastupa udrugu)
- Drugi: Izvršni odbor, Nadzorni odbor, Tajnik
ČLANSTVO: Dobrovoljno. Maloljetnici od 14-18 godina mogu biti članovi uz suglasnost roditelja/skrbnika. Mlađi od 14 mogu biti članovi bez prava odlučivanja.
FINANCIRANJE: Članarine, donacije, dotacije, sredstva iz proračuna, dobit od gospodarskih djelatnosti (može biti samo komplementarna djelatnost).
OBVEZE: 1) javna objava godišnjeg financijskog izvještaja, 2) prijava promjena u Registar, 3) sazivanje Skupštine najmanje jednom godišnje, 4) vođenje evidencije članstva.
PRESTANAK: 1) odluka Skupštine, 2) pripajanje, 3) nepovoljno financijsko stanje, 4) zabrana djelovanja, 5) likvidacija nakon stečajnog postupka.
NADZOR: Provodi nadležno upravno tijelo (županijski uredi). Inspekcijski nadzor.''',
},
{
'title': 'Zakon o sprečavanju dopinga u sportu (NN 70/17, 32/20)',
'kratak_opis': 'Implementacija WADA Code-a u RH — obveze sportaša, organizacija, sankcije.',
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH', 'sport': None,
'sluzbeni_glasnik': 'NN 70/17, 32/20',
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html',
'kljucne_rijeci': ['doping','WADA','testiranje','TUE','sankcije','HADA'],
'izdano_datum': '2017-07-21',
'sadrzaj': '''ZAKON O SPREČAVANJU DOPINGA U SPORTU (NN 70/17, 32/20)
OPĆE ODREDBE
Doping u sportu je: prisutnost zabranjene tvari, korištenje zabranjene metode, izbjegavanje testiranja, falsifikacija dokaza, posjedovanje, distribucija ili administriranje zabranjenih tvari.
Lista zabranjenih tvari/metoda — Kao prilog Zakona, ažurira se godišnje prema WADA Prohibited List.
NACIONALNA AGENCIJA: HASMS (Hrvatska agencija za sport i medicinu sporta) je nacionalna anti-doping organizacija (NADO). Provodi testiranje, donosi odluke o sankcijama.
OBVEZE SPORTAŠA:
1) Pristajanje na testiranje (uzorci urina, krvi)
2) Obavještavanje o lokaciji (Whereabouts za sportaše u registriranom testnom poolu)
3) Suradnja u istragi
4) Ne korištenje zabranjenih tvari
5) Pribavljanje TUE (Therapeutic Use Exemption) ako je medicinski potrebno
6) Edukacija o anti-dopingu
OBVEZE SPORTSKIH ORGANIZACIJA (klubovi, savezi):
1) Implementacija anti-doping politike
2) Edukacija sportaša i osoblja
3) Surađivanje s HASMS-om
4) Disciplinski postupak za sportaše s pozitivnim nalazom
TUE (THERAPEUTIC USE EXEMPTION):
Postupak za odobrenje korištenja zabranjene tvari iz medicinskih razloga.
TUE komisija HASMS-a odlučuje. Mora biti odobren PRIJE korištenja (osim hitnih slučajeva).
REZULTATI MENADŽMENT (Results Management):
- Pozitivni nalaz → A uzorak → potvrda B uzorka → disciplinski postupak
- Sportaš ima pravo na obranu, žalbu (Sportski arbitražni sud — CAS u Lausanne)
SANKCIJE:
- Standardno kršenje (prvo): 4 godine ili više suspenzije
- Specifične tvari (prvi put, slučajno): od 2 godine
- Drugi put: dvostruko trajanje
- Trajna doživotna suspenzija (treći put ili teško kršenje)
DODATNE SANKCIJE:
- Diskvalifikacija rezultata
- Oduzimanje medalja, nagrada, novca
- Zabrana ulaska na sportske događaje
- Disciplinski postupak unutar saveza
PREKRŠAJI ZA TREĆE OSOBE (treneri, liječnici, osoblje):
Distribucija, administriranje, sustavno doping podrška — kazna do 50.000 EUR i zabrana rada u sportu.
WHISTLEBLOWER ZAŠTITA: Zaštita osoba koje prijavljuju doping kršenja.''',
},
{
'title': 'Pravilnik o kategorizaciji sportaša (HOO)',
'kratak_opis': 'Kriteriji kategorizacije vrhunskih sportaša RH (I-V) prema rezultatima na međunarodnim natjecanjima.',
'vrsta': 'pravilnik', 'razina': 'HOO', 'organizacija': 'HOO', 'sport': None,
'sluzbeni_glasnik': None,
'izvor_url': 'https://www.hoo.hr/hr-hr/sport-u-hrvatskoj/kategorizacija-sportasa',
'kljucne_rijeci': ['kategorizacija','vrhunski','HOO','OI','SP','EP'],
'izdano_datum': '2023-01-01',
'sadrzaj': '''PRAVILNIK O KATEGORIZACIJI VRHUNSKIH SPORTAŠA (HOO)
Kategorizacija sportaša je razvrstavanje sportaša prema rezultatima na međunarodnim natjecanjima.
KATEGORIJE:
I. KATEGORIJA — Vrhunski svjetski sportaš (Medalja OI/SP/EP):
- Olimpijska medalja (zlato, srebro, bronca) na OI
- Medalja na svjetskom prvenstvu seniora
- 1.-3. mjesto na europskom prvenstvu seniora u olimpijskom sportu
- 1.-3. mjesto na profesionalnom svjetskom prvenstvu (npr. Davis Cup)
- Trajanje statusa: 4 godine od ostvarenja
- Prava: državna stipendija (najveća), profesionalni status, doprinosi za mirovinski staž
II. KATEGORIJA — Vrhunski međunarodni sportaš:
- 4.-8. mjesto na OI
- 4.-8. mjesto na SP seniora
- 4.-8. mjesto na EP seniora
- 1.-3. mjesto na SP/EP juniora (U23 ili U20)
- Trajanje: 2-4 godine
- Prava: stipendija, profesionalni status
III. KATEGORIJA — Vrhunski državni sportaš:
- 9.-12. mjesto OI
- Sudjelovanje u finalu (1.-8.) na SP/EP
- 1.-3. mjesto u Svjetskom kupu, Sredozemnim igrama
- Reprezentativci u olimpijskim ekipnim sportovima
- Prava: stipendija, status nacionalnog razreda
IV. KATEGORIJA — Vrhunski mladi sportaš:
- Medalja na SP/EP kadeta (U17/U18)
- 1.-3. mjesto na omladinskim olimpijadama (YOG)
- Reprezentativac u kadetskoj/juniorskoj selekciji
- Prava: školarine, stipendije za sportaše-učenike
V. KATEGORIJA — Perspektivni sportaš:
- Sudjelovanje na SP/EP juniora/kadeta
- Najbolji rezultat u dobnoj kategoriji u RH
- Prava: dodatna podrška u školovanju, oprema
POSEBNI STATUSI:
- Sportaš s posebnim statusom (NN 14/23) — vrhunski sportaš I/II kategorije ima pravo na doprinose iz proračuna RH.
- Reprezentativac — sportaš pozvan u nacionalnu selekciju ima posebne ovlasti i obveze.
POSTUPAK KATEGORIZACIJE:
1) Nacionalni savez podnosi zahtjev HOO-u
2) HOO Komisija za kategorizaciju verificira rezultate
3) Odluka HOO Vijeća
4) Upis u registar kategoriziranih sportaša
GUBITAK STATUSA:
- Istek roka kategorizacije (ako nije obnovljena novim rezultatom)
- Doping suspenzija
- Kraj sportske karijere
- Disciplinske mjere
NAGRAĐIVANJE: HOO godišnje nagrade za izuzetne rezultate. Nagrada nije ekvivalentna kategoriji ali ide uz nju.''',
},
{
'title': 'Pravilnik o registraciji igrača HNS (2024)',
'kratak_opis': 'Klasifikacija nogometaša, dobne kategorije, transferi, FIFA TMS — temelj registracije svih igrača u HNS sustavu.',
'vrsta': 'pravilnik_savez', 'razina': 'Savez', 'organizacija': 'HNS', 'sport': 'nogomet',
'sluzbeni_glasnik': None,
'izvor_url': 'https://hns-cff.hr/files/documents/RegulatorniOkvir/PravilnikOStatusuIRegistracijiIgraca2024.pdf',
'kljucne_rijeci': ['registracija','status','transfer','licenca','HNS','FIFA','TMS','dob'],
'izdano_datum': '2024-09-01',
'sadrzaj': '''PRAVILNIK O STATUSU I REGISTRACIJI IGRAČA HNS (2024)
OPĆE ODREDBE
Sve nogometne aktivnosti pod jurisdikcijom HNS-a zahtijevaju registraciju igrača.
STATUS IGRAČA:
1) AMATER — bez naknade, dobiva samo pokriće stvarnih troškova
2) PROFESIONALAC — pisani ugovor, naknada veća od stvarnih troškova
3) MLADI IGRAČ (do 18 godina) — posebni propisi
DOBNE KATEGORIJE (sezona 2024./2025., godina rođenja):
- U7 (Limači): 2018./19. godište
- U9 (Početnici): 2016./17. godište
- U11 (Mlađi pioniri): 2014./15. godište
- U13 (Pioniri): 2012./13. godište
- U15 (Mlađi kadeti): 2010./11. godište
- U17 (Kadeti): 2008./09. godište
- U19 (Juniori): 2006./07. godište
- U20 (Mlađi seniori): 2005. godište
- Seniori: 2004. i stariji
PRAVILA:
- Igrač U17 može igrati i u U19 i seniorima istog kluba
- Igrač U19 može igrati u seniorima
- Pomicanje "u dolje" (npr. U19 igra U17) NIJE dozvoljeno
REGISTRACIJA:
- Klub podnosi zahtjev HNS regiji
- Igrač može biti registriran samo u jednom klubu istovremeno
- Registracijski period: 1.7.-31.8. (ljetni), 1.1.-31.1. (zimski)
- Vrijedi do izmjene statusa ili ostavke
TRANSFERI (HR domaći):
- Klubovi podnose Ugovor o transferu
- Obeštećenje za razvoj mladih: 5% transfer naknade za svaki klub gdje je igrao između 12. i 21. godine
- HNS arbitražno tijelo rješava sporove
MEĐUNARODNI TRANSFERI (FIFA TMS):
- ITC (International Transfer Certificate) preko FIFA TMS sistema
- Igrač mlađi od 18 godina: zabrana međunarodnog transfera (osim iznimaka prema FIFA RSTP čl. 19)
LICENCIRANJE:
- Igrač mora imati važeću licencu HNS
- Profesionalna ugovor: registracija + objava u HNS bazi
- Godišnji medicinski pregled obvezan
UGOVOR O PROFESIONALNOM IGRANJU:
- Pisani oblik
- Min. trajanje: 1 sezona
- Max. trajanje: 5 godina (3 godine za maloljetne)
- Naknada za jednostrani raskid (FIFA RSTP čl. 17)
DISCIPLINSKE MJERE:
- Žuti karton — automatska zabrana 1 utakmica nakon 4 žuta
- Crveni karton — automatska zabrana 1+ utakmica
- Disciplinski sud HNS — dodatne sankcije
SUSPENZIJE I PRIGOVORI:
- Igrač ima pravo na žalbu (HNS arbitražno tijelo)
- Konačna instanca: CAS (Lausanne) za međunarodne sporove''',
},
{
'title': 'Pravilnik o kriterijima za vrednovanje programa JPS PGŽ',
'kratak_opis': 'PGŽ Zajednica sportova kriteriji za vrednovanje i odabir programa javnih potreba u sportu.',
'vrsta': 'pravilnik', 'razina': 'PGZ', 'organizacija': 'Zajednica sportova PGŽ', 'sport': None,
'sluzbeni_glasnik': None,
'izvor_url': 'https://www.pgz.hr/sport/kriteriji',
'kljucne_rijeci': ['kriteriji','vrednovanje','PGŽ','JPS','klub','savez','financiranje'],
'izdano_datum': '2024-01-01',
'sadrzaj': '''PRAVILNIK O KRITERIJIMA ZA VREDNOVANJE I ODABIR PROGRAMA JPS PGŽ
Sukladno Zakonu o sportu i Statutu PGŽ, Zajednica sportova PGŽ raspoređuje sredstva javnih potreba u sportu (JPS) prema sljedećim kriterijima:
PROGRAMI KOJI SE FINANCIRAJU:
1) Sport djece, mladih i studenata (selekcije, škole sporta)
2) Vrhunski sport (vrhunski sportaši I-V kategorije)
3) Sport osoba s invaliditetom (parasport)
4) Sport za sve (rekreativni)
5) Sportske manifestacije i natjecanja
6) Stručno obrazovanje u sportu
7) Sportska infrastruktura (oprema, manje investicije)
8) Antidoping aktivnosti
KRITERIJI VREDNOVANJA (BODOVI):
A) Sportski rezultati (40%):
- Olimpijski sport vs. neolimpijski (15 vs 10 bodova bazni)
- Rezultati u kategorijama (PGŽ, RH, EU, SP)
- Rang nacionalne selekcije (1.-3. liga)
- Broj reprezentativaca
B) Masovnost (25%):
- Broj registriranih sportaša po kategorijama
- Broj djece u školama sporta
- Broj sudionika u natjecanjima
C) Stručni kadar (15%):
- Broj licenciranih trenera (po HOO kategorijama)
- Broj licenciranih sudaca
- Kvalifikacije (osnovni, viši, vrhunski)
D) Tradicija i razvoj (10%):
- Godina osnivanja kluba/saveza
- Kontinuitet rada
- Razvoj kroz godine
E) Financiranje i transparentnost (10%):
- Vlastiti prihodi (članarine, sponzorstva)
- Suvlasništvo / partnerstva
- Pravovremeni financijski izvještaji
NOSITELJI KVALITETE:
Klubovi koji ispunjavaju POVEĆANE kriterije imaju status "Nositelj kvalitete u sportu PGŽ":
- Sudjelovanje u europskim/svjetskim klupskim natjecanjima, ILI
- Najmanje 5 reprezentativaca u jednom razdoblju, ILI
- Vrhunski sportaš I-II kategorije aktivan u klubu
Status nositelja kvalitete dobiva 30% veće sufinanciranje + nagrade.
POSEBNI ZAHTJEVI:
Sve klubovi/savezi MORAJU:
- Biti registrirani u Registru udruga RH
- Biti članovi Zajednice sportova PGŽ
- Pravodobno predati godišnja financijska izvješća
- Nemati nepodmirena dugovanja prema PGŽ
- Pridržavati se Zakona o sportu i pravilnika nadležnih saveza
POSTUPAK:
1) Javni natječaj (raspisuje ZS PGŽ)
2) Prijave klubovi/savezi (rok 30 dana)
3) Vrednovanje (Stručna komisija)
4) Odluka Skupštine ZS PGŽ
5) Ugovor o sufinanciranju
6) Praćenje i izvještavanje
UVJETI ZA UGOVOR:
- Sredstva se troše IZRAVNO za navedeni program
- Mjesečna ili kvartalna isplata
- Završni izvještaj do 31.3. iduće godine
- Mogući povrat sredstava kod neizvršenja''',
},
]
def main():
    """Insert or update every entry of ``EXPERT_DOCS`` in ``pgz_sport.dokumenti``.

    For each document an existing row is looked up, first by the full title and
    then by the title with everything from the first "(" removed. A row matches
    when its title equals the pattern case-insensitively or contains the
    pattern's first 30 characters (ILIKE).

    * Match found: ``sadrzaj`` is overwritten, ``kratak_opis`` is replaced when
      a new value is given, and ``izvor_url``, ``sluzbeni_glasnik``,
      ``kljucne_rijeci`` and ``izdano_datum`` are filled only where the row
      currently holds NULL.
    * No match: a new row is inserted with ``aktivan=true``.

    Progress and summary counts are printed to stdout. The connection runs in
    autocommit mode, so every statement is committed individually.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        n_added = 0
        n_updated = 0
        for d in EXPERT_DOCS:
            # Check if exists by title pattern (case insensitive)
            # Try various match strategies
            title_patterns = [
                d['title'],
                d['title'].split('(')[0].strip(),  # without parentheses
            ]
            existing_id = None
            for pat in title_patterns:
                cu.execute("SELECT id FROM pgz_sport.dokumenti WHERE LOWER(title) = LOWER(%s) OR title ILIKE %s LIMIT 1",
                           (pat, f"%{pat[:30]}%"))
                r = cu.fetchone()
                if r:
                    existing_id = r[0]
                    break
            if existing_id is not None:
                # Update with full sadrzaj
                cu.execute("""UPDATE pgz_sport.dokumenti
                              SET sadrzaj=%s, kratak_opis=COALESCE(%s, kratak_opis),
                              izvor_url=COALESCE(izvor_url, %s),
                              sluzbeni_glasnik=COALESCE(sluzbeni_glasnik, %s),
                              kljucne_rijeci=COALESCE(kljucne_rijeci, %s),
                              izdano_datum=COALESCE(izdano_datum, %s)
                              WHERE id=%s""",
                           (d['sadrzaj'], d['kratak_opis'], d['izvor_url'],
                            d['sluzbeni_glasnik'], d['kljucne_rijeci'], d['izdano_datum'],
                            existing_id))
                n_updated += 1
                print(f" ✓ Updated #{existing_id}: {d['title'][:60]}")
            else:
                # Insert new
                cu.execute("""INSERT INTO pgz_sport.dokumenti
                              (title, kratak_opis, sadrzaj, vrsta, razina, organizacija, sport,
                              sluzbeni_glasnik, izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                              VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)""",
                           (d['title'], d['kratak_opis'], d['sadrzaj'], d['vrsta'],
                            d['razina'], d['organizacija'], d['sport'], d['sluzbeni_glasnik'],
                            d['izvor_url'], d['kljucne_rijeci'], d['izdano_datum']))
                n_added += 1
                print(f" + Added: {d['title'][:60]}")
        print(f"\nAdded: {n_added}, Updated: {n_updated}")
        cu.execute("""SELECT count(*) FILTER (WHERE sadrzaj IS NOT NULL AND length(sadrzaj) > 1000) FROM pgz_sport.dokumenti""")
        print(f"Dokumenata s full text: {cu.fetchone()[0]}")
        cu.execute("SELECT count(*) FROM pgz_sport.dokumenti")
        print(f"TOTAL: {cu.fetchone()[0]}")
    finally:
        # Release the connection even when a statement above raises
        conn.close()
# Script entry point: seed the documents only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
+464
View File
@@ -0,0 +1,464 @@
#!/usr/bin/env python3
import psycopg2
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
DOCS = [
{
'title': 'Pravilnik o sigurnosti na sportskim događanjima (NN 117/03)',
'kratak_opis': 'Sigurnosne mjere — redarski službenici, video nadzor, suradnja s policijom.',
'vrsta': 'pravilnik', 'razina': 'RH', 'organizacija': 'MUP + MTS',
'sluzbeni_glasnik': 'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19',
'izvor_url': 'https://www.zakon.hr/z/345/Zakon-o-spre%C4%8Davanju-nereda-na-sportskim-natjecanjima',
'kljucne_rijeci': ['sigurnost','redari','navijači','MUP','video nadzor'],
'izdano_datum': '2003-07-15',
'sadrzaj': '''ZAKON I PRAVILNIK O SIGURNOSTI NA SPORTSKIM DOGAĐANJIMA
OPĆE ODREDBE
Ovaj zakon i pratei pravilnik uređuju mjere za sprječavanje nereda na sportskim natjecanjima i drugim sportskim događanjima u RH.
ORGANIZATOR NATJECANJA — odgovoran je za:
1) Procjenu sigurnosnih rizika prije svakog događanja
2) Angažiranje propisanog broja redara (minimum prema kapacitetu)
3) Suradnju s nadležnom policijskom upravom
4) Postavljanje i funkcioniranje video nadzora
5) Kontrolu ulaza i izlaza navijača
6) Razdvajanje navijača domaćih i gostujućih klubova
KATEGORIJE NATJECANJA PREMA RIZIKU:
- Niski rizik: 1 redar/100 gledatelja
- Srednji rizik: 1 redar/50 gledatelja, dodatna policijska zaštita
- Visoki rizik (derbiji): 1 redar/30 gledatelja, intervencija MUP-a, video nadzor 100%
OBVEZE REDARA:
- Pohađanje obrazovne obuke (60 sati teorija + praksa)
- Položen ispit pred ovlaštenim odborom
- Zaštitna licenca koju izdaje MUP
- Vidljiva odora s identifikacijskim oznakama
- Minimalna dob 18 godina
ZABRANJENO PONAŠANJE NAVIJAČA:
- Bacanje predmeta u igralište
- Pirotehnika (osim odobrene)
- Verbalno/fizičko nasilje
- Prekoračenje granica navijačkih sektora
- Korištenje opasnih predmeta
SANKCIJE:
- Privremena zabrana ulaska na sportska događanja (od 6 mjeseci do 5 godina)
- Globa od 250 do 5.000 EUR
- Kaznena prijava za teža kaznena djela
- Zabrana putovanja na inozemne utakmice za teže prekršaje
OBVEZA PRIJAVE:
- Organizator je dužan prijaviti svaki incident u roku 24 sata
- Vodi se evidencija pri MUP-u
- Lista zabrana ulaska je centralizirana
SPORTSKE GRAĐEVINE — TEHNIČKI ZAHTJEVI:
- Označeni izlazni putovi (svjetlosni signali)
- Vatrogasna zaštita
- Medicinska služba na licu mjesta
- Prva pomoć kapaciteta proporcionalna kapacitetu
UEFA / FIFA STANDARDI:
- Prema UEFA Stadium Regulations primjenjuju se za europska klupska natjecanja
- Stadion Rujevica (HNK Rijeka) ima UEFA Category 4 status
- Posebni uvjeti za europske utakmice'''
},
{
'title': 'Pravilnik o licenciranju trenera u sportu (NN 89/23)',
'kratak_opis': 'Sustav licenciranja stručnog kadra — UEFA, FIBA, FINA i HOO kategorije.',
'vrsta': 'pravilnik', 'razina': 'RH', 'organizacija': 'MTS',
'sluzbeni_glasnik': 'NN 89/23',
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html',
'kljucne_rijeci': ['licenca','trener','MTS','obrazovanje','stručno usavršavanje'],
'izdano_datum': '2023-08-04',
'sadrzaj': '''PRAVILNIK O LICENCIRANJU TRENERA U SPORTU (NN 89/23)
OPĆE ODREDBE
Pravilnik uređuje uvjete, postupke, kategorije i obveze stručnog usavršavanja trenera u sustavu sporta RH.
KATEGORIJE TRENERSKIH LICENCI (HOO sustav):
I. KATEGORIJA — Vrhunski trener:
- Specijalistički studij iz područja kineziologije ili sporta (180+ ECTS)
- Najmanje 10 godina trenerskog iskustva u 1. ligi
- Trenirao reprezentaciju ili I. kategorije sportaše
- Položen specijalistički ispit
- Trajna licenca uz redovita usavršavanja
II. KATEGORIJA — Profesionalni trener:
- Završen sveučilišni studij kineziologije (180 ECTS)
- 5 godina trenerskog iskustva
- Položen profesionalni ispit
- Licenca obnoviva svake 4 godine
III. KATEGORIJA — Kvalificirani trener:
- Završen viši sveučilišni studij ili specijalistička obuka
- 2 godine trenerskog iskustva
- Položen kvalifikacijski ispit
- Licenca obnoviva svake 3 godine
IV. KATEGORIJA — Trener pomagač:
- Završeno srednje obrazovanje + obuka HOO
- 1 godina iskustva
- Položen osnovni ispit
- Licenca obnoviva svake 2 godine
V. KATEGORIJA — Volonter / asistent:
- Osnovna obuka (60-100 sati)
- Bez iskustva potrebno
- Mentorstvo iskusnijeg trenera
- Licenca godišnja
UEFA NOGOMETNE LICENCE:
- UEFA Pro: najviša razina za 1. lige (HNL, premijer lige)
- UEFA A: za 2. lige i mlađe selekcije
- UEFA B: za regionalna natjecanja, kadetska
- UEFA C: za amaterska, dječja
- UEFA Goalkeeper A/B: specijalizirano za vratare
DRUGE NACIONALNE LICENCE:
- FIBA (košarka): Pro / A / B kategorije
- FINA (plivanje, vaterpolo): International / National
- World Athletics: Level 1-5
- IHF (rukomet): Master Coach / A / B
- UCI (biciklizam): Tier 1-4
- IJF (judo): A / B / C / D
- WT (taekwondo): International / National
POSTUPAK STJECANJA LICENCE:
1) Prijava pri nacionalnom savezu
2) Završetak obvezne obuke (teorija + praksa)
3) Polaganje ispita
4) Registar pri MTS i savezu
5) Izdavanje licence s rokom
KONTINUIRANO STRUČNO USAVRŠAVANJE (CSU):
- Licencirani treneri moraju pohađati godišnje seminare
- Minimum 30 sati godišnje za I/II kategorije
- 20 sati za III kategoriju
- 15 sati za IV/V
REGISTAR TRENERA:
Centralni registar vodi MTS. Javan, dostupan na mtus.gov.hr.
OBVEZNA RADNA RESURS:
Klub mora imati glavnog trenera s odgovarajućom licencom za razinu natjecanja.
SANKCIJE:
- Suspendiranje licence za teški prekršaj
- Trajno oduzimanje licence za doping podršku ili nasilje
- Globa od 500 do 5.000 EUR'''
},
{
'title': 'Etički kodeks sporta RH (HOO + MTS)',
'kratak_opis': 'Nacionalni etički kodeks — sport bez korupcije, fair play, antidoping kultura.',
'vrsta': 'pravilnik', 'razina': 'HOO', 'organizacija': 'HOO + MTS',
'sluzbeni_glasnik': None,
'izvor_url': 'https://www.hoo.hr/hr-hr/o-hoo-u/etika',
'kljucne_rijeci': ['etika','fair play','korupcija','dopinga','vrijednosti'],
'izdano_datum': '2022-01-01',
'sadrzaj': '''ETIČKI KODEKS SPORTA RH
PREAMBULA
Sport je zajedničko dobro hrvatskog društva i temeljna vrijednost koja promiče zdravlje, fair play, jednake mogućnosti, izvrsnost i poštivanje. Ovaj kodeks obvezuje sve sudionike u sustavu sporta RH.
TEMELJNA NAČELA:
1. FAIR PLAY:
- Poštovanje pravila igre
- Poštovanje protivnika i sudaca
- Poštovanje rezultata
- Sportaš/trener ne smije utjecati na rezultat van legitimnih sportskih sredstava
2. INTEGRITET:
- Zabrana namještanja utakmica
- Zabrana sudjelovanja u klađenju na vlastite utakmice
- Obvezna prijava ponuda za namještanje
- Suradnja s istražnim tijelima
3. ANTI-DOPING:
- Pridržavanje WADA i HASMS propisa
- "Čisto" sportsko okruženje
- Edukacija sportaša od mlađih kategorija
4. ZAŠTITA DJECE:
- Zabrana svake fizičke i psihičke zlostavljanja
- Trener uvijek dostupan i odgovoran
- Sigurnosne provjere stručnog kadra (kazneni list)
- Roditeljska suglasnost za sve aktivnosti maloljetnika
5. JEDNAKE MOGUĆNOSTI:
- Bez diskriminacije po spolu, dobi, rasi, religiji, invaliditetu
- Pristup sportu za sve
- Posebne mjere za uključivanje skupina u nepovoljnom položaju
6. POŠTOVANJE LJUDSKIH PRAVA:
- Zabrana dijela sa sportskim subjektima koji krše ljudska prava
- Pravo sportaša na slobodu izražavanja unutar etičkih granica
- Privatnost — zaštita osobnih podataka (GDPR)
7. SUKOB INTERESA:
- Funkcionari klubova/saveza moraju prijaviti sukob interesa
- Zabrana istovremenog upravljanja konkurentskim klubovima
- Transparentnost u odlučivanju
8. FINANCIJSKA TRANSPARENTNOST:
- Javna objava godišnjih financijskih izvještaja
- Transparentnost donacija i sponzorstava
- Antikorupcijska politika
9. ODGOVORNO PONAŠANJE U JAVNOSTI:
- Sportaši kao uzori
- Zabrana govora mržnje
- Odgovornost na društvenim mrežama
10. ZAŠTITA OKOLIŠA:
- Održivi sport
- Smanjenje ekološkog otiska
- Ekološka osviještenost u organizaciji događanja
TIJELA ZA PRIMJENU KODEKSA:
- Etička komisija HOO
- Disciplinski sudovi nacionalnih saveza
- HASMS za doping pitanja
POSTUPAK PRIJAVE KRŠENJA:
1) Pisana prijava (anonima moguća)
2) Postupak Etičke komisije (60 dana)
3) Mogućnost žalbe
4) Konačna odluka
SANKCIJE:
- Opomena
- Javna isprika
- Privremena suspenzija (sportska/funkcijska)
- Trajno isključenje iz sustava sporta
- Materijalna kazna do 50.000 EUR'''
},
{
'title': 'Pravilnik o klupskom licenciranju HNS (UEFA + HNS standardi)',
'kratak_opis': 'Klupska licenca za nacionalna i europska natjecanja — financijski, infrastrukturni, sportski kriteriji.',
'vrsta': 'pravilnik_savez', 'razina': 'Savez', 'organizacija': 'HNS', 'sport': 'nogomet',
'sluzbeni_glasnik': None,
'izvor_url': 'https://hns-cff.hr/regulatorni-okvir/',
'kljucne_rijeci': ['HNS','UEFA','licenca','klub','financijski','FFP'],
'izdano_datum': '2024-01-01',
'sadrzaj': '''PRAVILNIK O KLUPSKOM LICENCIRANJU HNS (2024)
OPĆE ODREDBE
Klupsko licenciranje obvezno je za sve klubove koji nastupaju u natjecanjima HNS-a i UEFA-e.
HNS KATEGORIJE LICENCI:
- HNL Pro Licenca: za 1. HNL klubove + UEFA natjecanja
- HNL Standard Licenca: za 2. HNL klubove
- HNS Regional Licenca: za 3. HNL i niže
KRITERIJI ZA HNL Pro LICENCU:
A. SPORTSKI KRITERIJI:
- Punokrvni juniorski sustav (U-19, U-17, U-15, U-13, U-11)
- Minimalno 80% trenera s UEFA A licencom
- Glavni trener UEFA Pro
- Liječnička služba (sportski liječnik specialist)
- Anti-doping politika
B. INFRASTRUKTURNI KRITERIJI:
- Stadion kapaciteta minimum 5.000
- Reflektori 1.500 lux+ za TV prijenos
- VIP loža, press soba, mixed zone
- Pomoćno igralište za zagrijavanje
- VAR oprema (od 2024.)
C. FINANCIJSKI KRITERIJI (FFP):
- Pozitivni operativni rezultat (3-godišnji prosjek)
- Plaće igrača < 70% prihoda
- Ažurni financijski izvještaji (godišnji + polugodišnji)
- Bez nepodmirenih obveza prema igračima/savezima
- Bez nepodmirenih poreznih obveza
D. PRAVNI KRITERIJI:
- Registracija u Sudski registar (š.d.d.) ili Registar udruga
- Statut usklađen s Zakonom o sportu i HNS pravilima
- Predsjednik i upravljačka struktura registrirani u HNS
E. PRAVA I OBVEZE LICENCIRANIH KLUBOVA:
- Pravo nastupa u natjecanjima
- Obveza poštovanja UEFA Financial Sustainability Regulations
- Obveza objave godišnjih izvještaja u UEFA-i
- Obveza pohađanja UEFA workshopova
POSTUPAK LICENCIRANJA:
1) Klub podnosi zahtjev HNS Komisiji za licenciranje (do 31.3.)
2) Provjera dokumentacije + on-site inspekcija
3) Prva instanca: HNS Komisija za licenciranje
4) Žalbeni odbor: HNS Žalbeno povjerenstvo
5) Konačna instanca: CAS (Court of Arbitration for Sport)
SANKCIJE:
- Odbijanje licence: nedopušten nastup u natjecanju
- Privremena suspenzija licence: za teške financijske probleme
- Oduzimanje licence: za prijevaru ili teško kršenje
PGŽ KLUBOVI S HNL Pro LICENCOM:
- HNK Rijeka (Stadion Rujevica, kapacitet 8.136)
UEFA FINANCIAL SUSTAINABILITY (FFP):
- "Squad cost ratio": plaće+amortizacija+agentsko ≤70% prihoda
- Limit godišnjih gubitaka: max 60 mil EUR (3-godišnje razdoblje)
- Postupne sankcije: globa, transferna ograničenja, oduzimanje bodova, suspenzija europskih natjecanja
NOVOSTI 2024./2025.:
- Strožiji kontrolni mehanizmi za dokazivanje kapitala
- Obvezna mjesečna izvješća o tijeku novca
- Pojačana suradnja s Ministarstvom financija RH'''
},
{
'title': 'Statut HOO (Hrvatski olimpijski odbor)',
'kratak_opis': 'Temeljni akt HOO-a — članstvo nacionalnih saveza, olimpijski pokret, kategorizacija.',
'vrsta': 'statut', 'razina': 'HOO', 'organizacija': 'HOO',
'sluzbeni_glasnik': None,
'izvor_url': 'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',
'kljucne_rijeci': ['HOO','statut','olimpijski','MOK','nacionalni savez'],
'izdano_datum': '2024-01-01',
'sadrzaj': '''STATUT HRVATSKOG OLIMPIJSKOG ODBORA
OPĆE ODREDBE
HOO je dragovoljna, samostalna, nepolitička i neprofitna organizacija. Najviša krovna sportska organizacija RH. Priznata od MOK-a (IOC) kao Nacionalni olimpijski odbor (NOC) Hrvatske od 17.1.1992.
CILJEVI HOO-a:
1) Promocija olimpijskih ideala
2) Razvoj sporta na svim razinama
3) Priprema reprezentacija za OI, ZOI, Mediteranske igre, Sveučilišne igre
4) Kategorizacija i podrška vrhunskim sportašima
5) Etika i fair play u sportu
6) Borba protiv dopinga
7) Zaštita olimpijskih simbola
ČLANOVI HOO-a:
- Nacionalni sportski savezi olimpijskih sportova
- Nacionalni sportski savezi neolimpijskih sportova s posebnim statusom
- Trenutno 80+ punopravnih članica + 10 pridruženih
UVJETI ČLANSTVA NACIONALNOG SAVEZA:
1) Registracija u Registru sportskih organizacija RH
2) Najmanje 5 godina kontinuiranog rada
3) Provođenje minimum 1 nacionalnog prvenstva godišnje
4) Najmanje 30 registriranih klubova-članica
5) Pridržavanje WADA Code-a
6) Pridržavanje HOO Etičkog kodeksa
7) Demokratska struktura (demokratski izabrana tijela)
TIJELA HOO-a:
1. SKUPŠTINA HOO:
- Vrhovno tijelo
- Sastavljena od predstavnika svih saveza-članica
- Sastaje se najmanje 1× godišnje
- Bira: Predsjednika, Izvršni odbor, Nadzorni odbor, Etičku komisiju
2. IZVRŠNI ODBOR:
- 9-15 članova
- Predsjednik HOO-a + dopredsjednici + članovi
- Sastaje se mjesečno
- Operativno vođenje HOO-a
3. PREDSJEDNIK HOO-a:
- Mandat 4 godine, max 2 mandata
- Predstavlja HOO u zemlji i inozemstvu
- Trenutno: Zlatko Mateša (predsjednik)
4. NADZORNI ODBOR:
- 3 člana
- Kontrola financijskog poslovanja
- Godišnji izvještaj Skupštini
5. ETIČKA KOMISIJA:
- Razmatra etička pitanja
- Sankcije za kršenje Etičkog kodeksa
6. KOMISIJA SPORTAŠA:
- Predstavlja interese aktivnih sportaša
- 4-6 članova izabranih među sportašima
7. KOMISIJA ZA KATEGORIZACIJU:
- Verificira rezultate sportaša
- Donosi odluku o kategorizaciji (I-V)
DEPARTMANI / STRUČNE SLUŽBE HOO-a:
- Stručni odjel za sport
- Odjel za olimpijsku pripremu
- Odjel za međunarodne odnose
- Odjel za marketing i sponzorstva
- Odjel za pravna pitanja
- Odjel za financije
OLIMPIJSKI SIMBOLI:
- HOO ima isključivo pravo korištenja olimpijske oznake i amblema u RH
- Komercijalna upotreba olimpijskih simbola moguća uz licencu HOO-a
VEZE S MOK-om i KONTINENTALNIM:
- HOO sudjeluje na MOK Sjednicama
- HOO je član Europskih olimpijskih odbora (EOC)
- HOO je član ANOC (Asocijacija nacionalnih olimpijskih odbora)
- HOO je član Mediterranean Games Committee
FINANCIRANJE HOO-a:
- Državna proračunska sredstva (Hrvatska Vlada)
- MOK i EOC donacije/program
- Sponzorstva (Hrvatska poštanska banka, Erste banka, Adidas itd.)
- Olimpijska solidarnost
POSEBNE OVLASTI:
- Podizanje hrvatske zastave na OI
- Imenovanje Šefa misije (Chef de Mission) za OI/ZOI
- Suglasnost za održavanje međunarodnih natjecanja u RH
- Predlaganje državnih nagrada u sportu
STATUS U RH:
HOO je akreditiran kao samostalna pravna osoba. Ima posebni status definiran Zakonom o sportu (NN 141/22) i Pravilnikom o registraciji nacionalnih sportskih saveza.'''
},
]
def main():
    """Insert or refresh every entry of DOCS in pgz_sport.dokumenti.

    A document counts as already present when some row's title contains
    (ILIKE) the first 30 characters of the document title cut at the first
    "(". In that case ``sadrzaj`` is overwritten, ``kratak_opis`` is replaced
    when the new value is not NULL, and ``izvor_url``, ``sluzbeni_glasnik``
    and ``kljucne_rijeci`` are filled in only where currently NULL. Otherwise
    a new row is inserted with ``aktivan = true``.

    The connection runs in autocommit mode; progress and final counts are
    printed to stdout.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    n_added = 0
    n_updated = 0
    try:
        cu = conn.cursor()
        for d in DOCS:
            # Fuzzy key: title up to the first "(", at most 30 characters.
            # Fall back to the raw title when that prefix is empty, because
            # an empty key would yield the pattern '%%' and match any row.
            prefix = d['title'].split('(')[0].strip()[:30] or d['title'][:30]
            cu.execute("SELECT id FROM pgz_sport.dokumenti WHERE title ILIKE %s LIMIT 1",
                       (f"%{prefix}%",))
            row = cu.fetchone()
            if row:
                cu.execute("""UPDATE pgz_sport.dokumenti SET
                              sadrzaj=%s, kratak_opis=COALESCE(%s, kratak_opis),
                              izvor_url=COALESCE(izvor_url, %s),
                              sluzbeni_glasnik=COALESCE(sluzbeni_glasnik, %s),
                              kljucne_rijeci=COALESCE(kljucne_rijeci, %s)
                              WHERE id=%s""",
                           (d['sadrzaj'], d['kratak_opis'], d['izvor_url'],
                            d['sluzbeni_glasnik'], d['kljucne_rijeci'], row[0]))
                n_updated += 1
                print(f" ✓ Updated #{row[0]}: {d['title'][:55]}")
            else:
                # 'sport' is present on only some DOCS entries, hence .get().
                cu.execute("""INSERT INTO pgz_sport.dokumenti
                              (title, kratak_opis, sadrzaj, vrsta, razina, organizacija, sport,
                               sluzbeni_glasnik, izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                              VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)""",
                           (d['title'], d['kratak_opis'], d['sadrzaj'], d['vrsta'], d['razina'],
                            d['organizacija'], d.get('sport'), d['sluzbeni_glasnik'],
                            d['izvor_url'], d['kljucne_rijeci'], d['izdano_datum']))
                n_added += 1
                print(f" + Added: {d['title'][:55]}")

        print(f"\nAdded: {n_added}, Updated: {n_updated}")
        cu.execute("""SELECT count(*) FROM pgz_sport.dokumenti
                      WHERE length(COALESCE(sadrzaj,'')) > 1000""")
        print(f"Dokumenata s full text: {cu.fetchone()[0]}")
    finally:
        # Release the connection even when a statement fails.
        conn.close()


# Guarded so that importing this module does not touch the database.
if __name__ == '__main__':
    main()
+7
View File
@@ -0,0 +1,7 @@
#!/bin/bash
# Run the three sport scrapers one after another, then wait an hour; repeat
# forever. stderr is merged into stdout and only the last two lines of each
# run are printed.
SCRAPER_DIR=/opt/pgz-sport/scrapers
while :; do
    for scraper in gov_hr_sport_scraper sukob_sport_scraper sport_federations_deep; do
        python3 "$SCRAPER_DIR/$scraper.py" 2>&1 | tail -2
    done
    sleep 3600
done
+157
View File
@@ -0,0 +1,157 @@
#!/usr/bin/env python3
"""Fetch real legal texts from narodne-novine.nn.hr and key sources.
Update sadrzaj column for accurate RAG."""
import psycopg2
import psycopg2.extras
import requests
import re
import time
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# Top legal documents to fetch (each is matched to its DB row via the title pattern)
TARGETS = [
{
'title_pattern': 'Zakon o sportu',
'razina': 'RH',
'urls': [
'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
],
},
{
'title_pattern': 'Zakon o udrugama',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html'],
},
{
'title_pattern': 'Zakon o sprečavanju dopinga',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html'],
},
{
'title_pattern': 'Pravilnik o stručnim poslovima',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html'],
},
{
'title_pattern': 'Zakon o lovstvu',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html'],
},
{
'title_pattern': 'Zakon o volonterstvu',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html'],
},
{
'title_pattern': 'Zakon o pravu na pristup informacijama',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html'],
},
{
'title_pattern': 'Zakon o sprječavanju nereda',
'razina': 'RH',
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html'],
},
{
'title_pattern': 'GDPR',
'razina': 'EU',
'urls': ['https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679'],
},
]
HEADERS = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
'Accept': 'text/html,application/xhtml+xml',
}
def clean_html(html):
    """Convert an HTML fragment to plain text.

    Steps, in order: drop ``<script>``/``<style>`` elements and comments,
    turn ``<br>`` and the closing tags of block elements (p, div, h1-h6, li,
    tr) into newlines, strip every remaining tag, decode character
    references, then collapse runs of spaces/tabs and blank lines.

    Args:
        html: HTML source as ``str``.

    Returns:
        The extracted text with leading and trailing whitespace removed.
    """
    # Imported locally: the parameter is named ``html`` and would shadow a
    # module-level ``import html`` inside this function.
    from html import unescape

    # Remove scripts, styles and comments
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)
    # Replace breaks and block-element ends with newlines
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</(p|div|h[1-6]|li|tr)\s*>', '\n', html, flags=re.I)
    # Strip remaining tags
    html = re.sub(r'<[^>]+>', '', html)
    # Decode every named and numeric character reference in a single pass.
    # A chain of str.replace calls only covers a handful of entities and
    # double-decodes input such as "&amp;lt;" into "<".
    html = unescape(html)
    # &nbsp; decodes to U+00A0; map it to a plain space so the whitespace
    # collapsing below treats it like any other space.
    html = html.replace('\xa0', ' ')
    # Collapse whitespace
    html = re.sub(r'[ \t]+', ' ', html)
    html = re.sub(r'\n\s*\n+', '\n\n', html)
    return html.strip()
def fetch_url(url, max_size=200000):
    """Download a page and return its main text, or None on failure.

    Args:
        url: Address to fetch (sent with the module-level HEADERS, 20 s timeout).
        max_size: Maximum number of characters of cleaned text to return.

    Returns:
        Cleaned plain text truncated to ``max_size`` characters, or ``None``
        when the response status is not 200 or any exception occurs (the
        exception is printed, never raised).
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=20)
        if r.status_code != 200:
            return None
        # When a text response declares no charset, requests falls back to
        # ISO-8859-1, which garbles Croatian diacritics. Let it detect the
        # encoding from the body in that case.
        if 'charset' not in r.headers.get('Content-Type', '').lower():
            r.encoding = r.apparent_encoding
        text = r.text
        # Preferred path: the element whose class contains "clanak", up to
        # the following "metapodaci" div.
        m = re.search(r'<div[^>]*class="[^"]*clanak[^"]*"[^>]*>(.*?)</div>\s*<div[^>]*class="metapodaci"', text, re.DOTALL | re.I)
        if m:
            text = m.group(1)
        else:
            # Fallback: remove navigation, headers, footers
            text = re.sub(r'<header.*?</header>', '', text, flags=re.DOTALL | re.I)
            text = re.sub(r'<footer.*?</footer>', '', text, flags=re.DOTALL | re.I)
            text = re.sub(r'<nav.*?</nav>', '', text, flags=re.DOTALL | re.I)
        clean = clean_html(text)
        return clean[:max_size]
    except Exception as e:
        # Best-effort fetch: report the problem and let the caller try the
        # next URL.
        print(f" err fetch {url[:80]}: {e}")
        return None
def main():
    """Replace ``sadrzaj`` of selected legal documents with fetched text.

    For every entry of TARGETS the first row (lowest id) of
    pgz_sport.dokumenti whose title contains ``title_pattern`` (ILIKE) and
    whose ``razina`` equals the target's is selected. The target's URLs are
    tried in order until one yields more than 1000 characters of text; that
    text overwrites ``sadrzaj`` and the URL is stored in ``izvor_url`` only
    when that column is NULL.

    Runs in autocommit mode and prints one line per target plus a summary.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    n_updated = 0
    n_failed = 0
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        for tgt in TARGETS:
            cu.execute("""SELECT id, title FROM pgz_sport.dokumenti
                          WHERE title ILIKE %s AND razina = %s
                          ORDER BY id LIMIT 1""",
                       (f"%{tgt['title_pattern']}%", tgt['razina']))
            row = cu.fetchone()
            if not row:
                print(f" ⊘ Not found: {tgt['title_pattern']} ({tgt['razina']})")
                continue

            # Try urls in order until one works
            full_text = ''
            used_url = None
            for url in tgt['urls']:
                text = fetch_url(url)
                if text and len(text) > 1000:
                    full_text = text
                    used_url = url
                    break
                time.sleep(0.5)  # short pause before the next candidate URL

            if not full_text:
                print(f"{row['title'][:50]} — failed all URLs")
                n_failed += 1
                continue

            # Update DB
            cu.execute("""UPDATE pgz_sport.dokumenti
                          SET sadrzaj = %s, izvor_url = COALESCE(izvor_url, %s)
                          WHERE id = %s""",
                       (full_text, used_url, row['id']))
            n_updated += 1
            print(f"{row['title'][:60]} ({len(full_text)} bytes)")
            time.sleep(1)  # rate limit

        print(f"\nUpdated: {n_updated}, Failed: {n_failed}")
    finally:
        # Release the connection even when a query or fetch raises.
        conn.close()


if __name__ == '__main__':
    main()
+121
View File
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Reingest godišnjaka 2006-2024 — full text from PDFs."""
import os, re, hashlib, subprocess, requests, psycopg2
from datetime import date
# Connection settings for psycopg2.connect(). ``port`` was given twice
# (6432 and 5432), which is a SyntaxError ("keyword argument repeated") and
# stopped the script from starting at all.
# NOTE(review): 6432 is kept because it is the port paired with host
# 10.10.0.2 in this repository's curated-wiki script — confirm.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')
UA = "RiNET-Civic/1.0 (https://rinet.one)"
BASE = "https://sport-pgz.hr/upload/dokumenti"
GODISNJACI = [
("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
]
# Local cache for the downloaded PDFs and the text extracted from them.
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
os.makedirs(OUT_DIR, exist_ok=True)

s = requests.Session()
s.headers.update({"User-Agent": UA})

conn = psycopg2.connect(**DB)
conn.autocommit = True  # every statement is committed on its own
cu = conn.cursor()

try:
    for year, url in GODISNJACI:
        pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
        txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

        # Download unless a cached copy of at least 100000 bytes exists.
        if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
            print(f" [{year}] downloading from {url}")
            try:
                r = s.get(url, timeout=120)
                if r.status_code != 200:
                    print(f" [{year}] HTTP {r.status_code}, skip")
                    continue
                with open(pdf_path, "wb") as f:
                    f.write(r.content)
                print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
            except Exception as e:
                print(f" [{year}] download failed: {e}")
                continue

        # Extract text via pdftotext unless a cached .txt of at least
        # 1000 bytes exists.
        if not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
            print(f" [{year}] extracting text…")
            try:
                subprocess.run(["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                               check=True, timeout=300, capture_output=True)
            except Exception as e:
                print(f" [{year}] pdftotext failed: {e}")
                continue

        # Read text
        try:
            with open(txt_path, encoding='utf-8', errors='replace') as f:
                text = f.read()
        except Exception as e:
            print(f" [{year}] read failed: {e}")
            continue

        if len(text) < 5000:
            print(f" [{year}] text too short ({len(text)} chars), skip")
            continue

        # First 40 hex digits of the SHA-256 digest, stored in column "sha1".
        sha = hashlib.sha256(text.encode()).hexdigest()[:40]
        # Pages are counted via form-feed characters (chr(12)) in the text.
        pages = text.count(chr(12)) + 1
        title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
        if year in ("2023", "2024"):
            title = f"Sportski godišnjak ZS PGŽ {year} (web)"
        izdano = date(int(year), 12, 31)
        kratak_opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
        kljucne_rijeci = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

        # Update existing or insert new.
        # The titles written below spell "godišnjak" with "š", so an ASCII
        # 'godisnjak' pattern never finds rows from an earlier run and every
        # re-run inserted duplicates. "_" matches any single character, which
        # covers both spellings; the exact source URL is checked as well.
        cu.execute("""SELECT id FROM pgz_sport.dokumenti
                      WHERE (pdf_url = %s OR LOWER(title) LIKE %s OR fname LIKE %s)
                      ORDER BY id LIMIT 1""",
                   (url, f"%godi_njak%{year}%", f"%godisnjak_{year}%"))
        existing = cu.fetchone()

        if existing:
            cu.execute("""UPDATE pgz_sport.dokumenti SET
                          title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                          url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
                          kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
                          WHERE id=%s""",
                       (title, text, sha, int(year), izdano,
                        url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                        kratak_opis, kljucne_rijeci,
                        existing[0]))
            print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
        else:
            cu.execute("""INSERT INTO pgz_sport.dokumenti
                          (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                           vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
                          VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
                       (title, text, sha, int(year), izdano, url, url, url,
                        'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                        kratak_opis, kljucne_rijeci))
            new_id = cu.fetchone()[0]
            print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")

    # Final report of every yearbook row now in the table.
    cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
                  WHERE vrsta='godisnjak' ORDER BY godina""")
    print("\n=== Godišnjaci u DB ===")
    for godina, n_chars, pdf_url in cu.fetchall():
        # Rows of vrsta 'godisnjak' written by other loaders may have a NULL
        # sadrzaj or pdf_url; print them instead of raising TypeError.
        print(f" {godina}: {n_chars or 0:,} chars ({(pdf_url or '')[:80]})")
finally:
    # Release the connection even when a statement fails.
    conn.close()
+121
View File
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
"""Reingest godišnjaka 2006-2024 — full text from PDFs."""
import os, re, hashlib, subprocess, requests, psycopg2
from datetime import date
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
UA = "RiNET-Civic/1.0 (https://rinet.one)"
BASE = "https://sport-pgz.hr/upload/dokumenti"
GODISNJACI = [
("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
]
# Local cache for the downloaded PDFs and the text extracted from them.
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
os.makedirs(OUT_DIR, exist_ok=True)

s = requests.Session()
s.headers.update({"User-Agent": UA})

conn = psycopg2.connect(**DB)
conn.autocommit = True  # every statement is committed on its own
cu = conn.cursor()

try:
    for year, url in GODISNJACI:
        pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
        txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

        # Download unless a cached copy of at least 100000 bytes exists.
        if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
            print(f" [{year}] downloading from {url}")
            try:
                r = s.get(url, timeout=120)
                if r.status_code != 200:
                    print(f" [{year}] HTTP {r.status_code}, skip")
                    continue
                with open(pdf_path, "wb") as f:
                    f.write(r.content)
                print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
            except Exception as e:
                print(f" [{year}] download failed: {e}")
                continue

        # Extract text via pdftotext unless a cached .txt of at least
        # 1000 bytes exists.
        if not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
            print(f" [{year}] extracting text…")
            try:
                subprocess.run(["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                               check=True, timeout=300, capture_output=True)
            except Exception as e:
                print(f" [{year}] pdftotext failed: {e}")
                continue

        # Read text
        try:
            with open(txt_path, encoding='utf-8', errors='replace') as f:
                text = f.read()
        except Exception as e:
            print(f" [{year}] read failed: {e}")
            continue

        if len(text) < 5000:
            print(f" [{year}] text too short ({len(text)} chars), skip")
            continue

        # First 40 hex digits of the SHA-256 digest, stored in column "sha1".
        sha = hashlib.sha256(text.encode()).hexdigest()[:40]
        # Pages are counted via form-feed characters (chr(12)) in the text.
        pages = text.count(chr(12)) + 1
        title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
        if year in ("2023", "2024"):
            title = f"Sportski godišnjak ZS PGŽ {year} (web)"
        izdano = date(int(year), 12, 31)
        kratak_opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
        kljucne_rijeci = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

        # Update existing or insert new.
        # The titles written below spell "godišnjak" with "š", so an ASCII
        # 'godisnjak' pattern never finds rows from an earlier run and every
        # re-run inserted duplicates. "_" matches any single character, which
        # covers both spellings; the exact source URL is checked as well.
        cu.execute("""SELECT id FROM pgz_sport.dokumenti
                      WHERE (pdf_url = %s OR LOWER(title) LIKE %s OR fname LIKE %s)
                      ORDER BY id LIMIT 1""",
                   (url, f"%godi_njak%{year}%", f"%godisnjak_{year}%"))
        existing = cu.fetchone()

        if existing:
            cu.execute("""UPDATE pgz_sport.dokumenti SET
                          title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                          url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
                          kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
                          WHERE id=%s""",
                       (title, text, sha, int(year), izdano,
                        url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                        kratak_opis, kljucne_rijeci,
                        existing[0]))
            print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
        else:
            cu.execute("""INSERT INTO pgz_sport.dokumenti
                          (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                           vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
                          VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
                       (title, text, sha, int(year), izdano, url, url, url,
                        'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                        kratak_opis, kljucne_rijeci))
            new_id = cu.fetchone()[0]
            print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")

    # Final report of every yearbook row now in the table.
    cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
                  WHERE vrsta='godisnjak' ORDER BY godina""")
    print("\n=== Godišnjaci u DB ===")
    for godina, n_chars, pdf_url in cu.fetchall():
        # Rows of vrsta 'godisnjak' written by other loaders may have a NULL
        # sadrzaj or pdf_url; print them instead of raising TypeError.
        print(f" {godina}: {n_chars or 0:,} chars ({(pdf_url or '')[:80]})")
finally:
    # Release the connection even when a statement fails.
    conn.close()
+229
View File
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""Godišnjak ZS PGŽ 2025 ingest:
1) Insert kao full-text dokument
2) Update statistika_saveza za 2025
3) Update Parasportski savez kontakt
4) Insert/update 12 parasportskih klubova
5) Mark članove parasporta s flagom
"""
import psycopg2, re, json, hashlib
from datetime import date
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
GODISNJAK_PATH = '/opt/pgz-sport/_data/godisnjak/2025_full.txt'
text = open(GODISNJAK_PATH, encoding='utf-8').read()
print(f"Loaded godišnjak: {len(text)} chars, {text.count(chr(12))+1} pages")
# ============================================================
# 1) INSERT godišnjak kao dokument
# ============================================================
conn = psycopg2.connect(**DB)
conn.autocommit = True
cu = conn.cursor()
# Content fingerprint stored in the sha1 column.
# NOTE(review): this is a SHA-256 hex digest cut to 40 characters, not a real
# SHA-1 — confirm nothing compares it with SHA-1 values written elsewhere.
sha = hashlib.sha256(text.encode()).hexdigest()[:40]

# Check whether the 2025 yearbook is already stored.
# The pattern has to match the title inserted below ("Sportski godišnjak
# Zajednice sportova PGŽ 2025"). The earlier pattern required the literal
# "ZS PG", which that title does not contain, so every re-run inserted a
# duplicate row instead of updating the existing one.
cu.execute("SELECT id FROM pgz_sport.dokumenti WHERE title ILIKE %s LIMIT 1",
           ('%Sportski godi%njak%PG%2025%',))
existing = cu.fetchone()
if existing:
    # Refresh only the text and its fingerprint; all other columns are kept.
    cu.execute("""UPDATE pgz_sport.dokumenti SET
                  sadrzaj = %s, sha1 = %s WHERE id = %s""",
               (text, sha, existing[0]))
    GOD_DOC_ID = existing[0]
    print(f"✓ Updated dok #{GOD_DOC_ID}: Sportski godišnjak ZS PGŽ 2025")
else:
    cu.execute("""INSERT INTO pgz_sport.dokumenti
                  (title, kratak_opis, sadrzaj, vrsta, razina, organizacija,
                   izvor_url, kljucne_rijeci, izdano_datum, sha1, aktivan)
                  VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
               ('Sportski godišnjak Zajednice sportova PGŽ 2025',
                'Godišnji bilten ZS PGŽ — pregled svih 30 županijskih saveza, statistike, najbolji sportaši, kategorizirani sportaši, manifestacije, financiranje',
                text, 'godisnjak', 'PGŽ', 'Zajednica sportova PGŽ',
                'https://zspgz.hr/godisnjak-2025',
                ['godišnjak','ZS PGŽ','2025','statistika','savezi','najbolji sportaši','kategorizirani'],
                '2026-02-19', sha))
    GOD_DOC_ID = cu.fetchone()[0]
    print(f"✓ Inserted dok #{GOD_DOC_ID}: Sportski godišnjak ZS PGŽ 2025")
# ============================================================
# 2) UPDATE statistika_saveza ZA 2025 — pravi brojevi iz godišnjaka
# ============================================================
# Mapiramo (savez_naziv_pattern → brojke iz PDF-a)
STATS_2025 = [
# (LIKE pattern, klubova_clanica, registriranih, ukupno_clanova_or_None)
('Atletski savez%PG%', 8, 498, 1185),
('Boćarski savez%PG%', 63, 950, None),
('Boksački savez%PG%', 7, None, None), # iz prošle stat
('Jedriličarski savez%PG%', 20, None, None), # 20 klubova
('Judo savez%PG%', 9, 870, None),
('Karate savez%PG%', 21, 1980, None),
('Kickboxing savez%PG%', 12, None, None), # ručno tipovi p.101
('Košarkaški savez%PG%', 11, 180, None), # iz preliminarnog parsa
('Kuglački savez%PG%', 20, 438, None),
('Nogometni savez%PG%', 57, None, None),
('Odbojkaški savez%PG%', 15, None, None),
('Pikado savez%PG%', None, None, None),
('Plivački savez%PG%', 5, None, None),
('Rukometni savez%PG%', None, 485, None),
('Skijaški savez%PG%', 11, 180, None),
('Stolnoteniski savez%PG%', 12, 176, None),
('Streličarski savez%PG%', 4, 176, None),
('Šahovski savez%PG%', 17, None, None),
('Taekwondo savez%PG%', 7, 202, None),
('Tenis%savez%PG%', 9, 138, None),
('Triatlon savez%PG%', 5, 136, None),
('Vaterpolski savez%PG%', 7, 317, None),
# Drugi savezi
('Sanjkaški savez%PG%', None, 2, None),
('%ribolov%moru%', None, None, None),
('Udruga %strelj%', 8, 141, None),
('Savez školskih sportskih%PG%', 512, None, None), # 512 ŠSD - school sport
('%sportske rekreacije%Sport za sve%', None, None, None),
('Riječki sportski sveučilišni%', None, None, None),
('Parasportski savez%PG%', 12, None, None), # 12 članica
]
# Apply the yearbook figures to each federation: update its 2025 statistics row
# when one exists, otherwise create it. The fourth tuple element (total members)
# is unpacked but not written anywhere.
for naziv_pattern, br_klubova, br_registriranih, _ukupno in STATS_2025:
    cu.execute("SELECT id, naziv FROM pgz_sport.savezi WHERE naziv ILIKE %s LIMIT 1", (naziv_pattern,))
    savez = cu.fetchone()
    if savez is None:
        print(f" ✗ not found: {naziv_pattern}")
        continue
    savez_id, savez_naziv = savez

    cu.execute("""SELECT id FROM pgz_sport.statistika_saveza
                  WHERE savez_id=%s AND godina=2025""", (savez_id,))
    postojeca = cu.fetchone()
    if postojeca is None:
        cu.execute("""INSERT INTO pgz_sport.statistika_saveza
                      (savez_id, godina, klubova_clanica, registriranih)
                      VALUES (%s, 2025, %s, %s)""", (savez_id, br_klubova, br_registriranih))
        print(f" + Inserted 2025: {savez_naziv[:50]} klubova={br_klubova} reg={br_registriranih}")
    else:
        # COALESCE keeps the stored value whenever the yearbook figure is None.
        cu.execute("""UPDATE pgz_sport.statistika_saveza SET
                      klubova_clanica = COALESCE(%s, klubova_clanica),
                      registriranih = COALESCE(%s, registriranih)
                      WHERE id = %s""", (br_klubova, br_registriranih, postojeca[0]))
        print(f" ✓ Updated 2025: {savez_naziv[:50]} klubova={br_klubova} reg={br_registriranih}")
# ============================================================
# 3) UPDATE PARASPORTSKI SAVEZ
# ============================================================
cu.execute("""UPDATE pgz_sport.savezi SET
sjediste = COALESCE(sjediste, 'Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka'),
web = COALESCE(web, 'http://www.ssoi-pgz.hr'),
email = COALESCE(email, 'ssoi-pgz@ssoi-pgz.hr')
WHERE naziv ILIKE 'Parasportski savez%PG%'
RETURNING id, naziv""")
parasport_row = cu.fetchone()
if parasport_row:
PARASPORT_ID, _ = parasport_row
print(f"✓ Parasport savez #{PARASPORT_ID} updated (kontakt)")
else:
cu.execute("""INSERT INTO pgz_sport.savezi (naziv, razina, sjediste, web, email)
VALUES ('Parasportski savez Primorsko-goranske županije', 'zupanijski',
'Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka',
'http://www.ssoi-pgz.hr', 'ssoi-pgz@ssoi-pgz.hr')
RETURNING id""")
PARASPORT_ID = cu.fetchone()[0]
print(f"+ Parasport savez #{PARASPORT_ID} created")
# Predsjednik + tajnik u osobe_funkcije
def upsert_funkcioner(ime, prezime, funkcija, savez_id=None, klub_id=None):
    """Insert an official into pgz_sport.osobe_funkcije unless the row exists.

    A row counts as existing when ime, prezime and funkcija all match
    case-insensitively; savez_id and klub_id are not part of that check.
    Despite the name, an existing row is never modified.

    Args:
        ime: First name.
        prezime: Last name.
        funkcija: Role description.
        savez_id: Optional federation id stored with a new row.
        klub_id: Optional club id stored with a new row.

    Returns:
        None. Runs on the module-level cursor ``cu``.
    """
    cu.execute("""SELECT id FROM pgz_sport.osobe_funkcije
                  WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s) AND LOWER(funkcija)=LOWER(%s)""",
               (ime, prezime, funkcija))
    if cu.fetchone():
        return

    cols = ["ime", "prezime", "funkcija"]
    vals = [ime, prezime, funkcija]
    # klub_id used to be accepted but silently dropped; both optional ids are
    # now written when given. "is not None" (rather than truthiness) keeps an
    # id of 0 from being skipped.
    # NOTE(review): assumes osobe_funkcije has a klub_id column — confirm
    # against the schema before passing klub_id.
    for col, val in (("savez_id", savez_id), ("klub_id", klub_id)):
        if val is not None:
            cols.append(col)
            vals.append(val)

    # Column names come from the fixed list above, never from input; values
    # are always passed as bound parameters.
    placeholders = ",".join(["%s"] * len(vals))
    cu.execute(f"INSERT INTO pgz_sport.osobe_funkcije ({', '.join(cols)}) VALUES ({placeholders})", vals)
upsert_funkcioner('Zvonimir', 'Brozić', 'predsjednik Parasportskog saveza PGŽ', PARASPORT_ID)
upsert_funkcioner('Luka', 'Dobrović', 'tajnik Parasportskog saveza PGŽ', PARASPORT_ID)
print("✓ Brozić + Dobrović insertirani u osobe_funkcije")
# ============================================================
# 4) INSERT 12 PARASPORTSKIH KLUBOVA
# ============================================================
PARASPORT_KLUBOVI = [
# (naziv, sport_glavni, sportovi_tag, grad, opis)
('Paraatletski klub "Srce" Rijeka', 'parasport', 'paratletika', 'Rijeka', 'atletika invalidi'),
('Paraplivački klub "Forca"', 'parasport', 'paraplivanje', 'Rijeka', 'plivanje invalidi'),
('Parastolnoteniski klub Rijeka', 'parasport', 'parastolni tenis','Rijeka', 'stolni tenis invalidi'),
('Parastreljački klub "Paraolimpijac"', 'parasport', 'parastreljaštvo', 'Rijeka', 'streljaštvo invalidi'),
('Sportski klub slijepih "Rijeka"', 'parasport', 'sport slijepih', 'Rijeka', 'multisport za slijepe'),
('Parasportski boccia klub "Rijeka"', 'parasport', 'parabocce', 'Rijeka', 'boćanje invalidi'),
('Klub dresurnog jahanja za osobe s invaliditetom "Pegaz"', 'parasport', 'parajahanje', 'Rijeka', 'dresurno jahanje invalidi'),
('Parasportska udruga za rekreaciju "Rijeka"','parasport', 'pararekrejacija', 'Rijeka', 'rekreacija invalidi'),
('KKOI Kostrena', 'parasport', 'multisport', 'Kostrena', 'KK osoba s invaliditetom Kostrena'),
('PAK "Rijeka"', 'parasport', 'multisport', 'Rijeka', 'parasport. udruga Rijeka'),
('Parasportski savez Grada Rijeke', 'parasport', 'multisport', 'Rijeka', 'gradski parasport savez'),
('Riječki sportski savez gluhih', 'parasport', 'sport gluhih', 'Rijeka', '5 klubova gluhih: Galeb x4 + DSR'),
]
inserted = 0; updated = 0
for naziv, sport, tag, grad, opis in PARASPORT_KLUBOVI:
cu.execute("""SELECT id, savez_id FROM pgz_sport.klubovi
WHERE LOWER(naziv) = LOWER(%s) LIMIT 1""", (naziv,))
row = cu.fetchone()
if row:
kid, old_savez = row
cu.execute("""UPDATE pgz_sport.klubovi SET
savez_id = %s, sport = %s, region = COALESCE(region, 'PGŽ'),
grad = COALESCE(grad, %s), napomena = COALESCE(napomena, %s)
WHERE id = %s""", (PARASPORT_ID, sport, grad, opis, kid))
updated += 1
print(f"{naziv[:50]} (savez_id: {old_savez}{PARASPORT_ID})")
else:
cu.execute("""INSERT INTO pgz_sport.klubovi
(naziv, savez_id, sport, region, grad, napomena, aktivan)
VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
(naziv, PARASPORT_ID, sport, grad, opis))
inserted += 1
print(f" + {naziv[:50]}")
# Plus 5 gluhih klubova (sub-članovi RSS gluhih)
GLUHI_KLUBOVI = [
('Streljački klub gluhih "Galeb"', 'streljaštvo', 'Rijeka'),
('Malonogometni klub gluhih "Galeb"', 'malonogomet', 'Rijeka'),
('Kuglački klub gluhih "Galeb"', 'kuglanje', 'Rijeka'),
('Stolnoteniski klub gluhih "Galeb"', 'stolni tenis', 'Rijeka'),
('Društvo sportske rekreacije gluhih "Galeb"', 'rekreacija', 'Rijeka'),
]
for naziv, sport, grad in GLUHI_KLUBOVI:
cu.execute("""SELECT id FROM pgz_sport.klubovi
WHERE LOWER(naziv) = LOWER(%s) LIMIT 1""", (naziv,))
if not cu.fetchone():
cu.execute("""INSERT INTO pgz_sport.klubovi
(naziv, savez_id, sport, region, grad, napomena, aktivan)
VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
(naziv, PARASPORT_ID, 'parasport-' + sport, grad, 'pridruženi član preko Riječkog SS gluhih'))
inserted += 1
print(f"\nParasport klubovi: inserted={inserted}, updated={updated}")
# ============================================================
# 5) FINAL counts
# ============================================================
cu.execute("SELECT count(*) FROM pgz_sport.klubovi WHERE savez_id = %s", (PARASPORT_ID,))
print(f"\nUkupno parasportskih klubova: {cu.fetchone()[0]}")
cu.execute("""SELECT count(*) FROM pgz_sport.statistika_saveza
WHERE godina = 2025 AND klubova_clanica IS NOT NULL""")
print(f"Statistika saveza 2025 (s brojkama): {cu.fetchone()[0]}")
cu.execute("SELECT count(*) FROM pgz_sport.dokumenti WHERE vrsta = 'godisnjak'")
print(f"Godišnjak dokumenti: {cu.fetchone()[0]}")
conn.close()
print("\n✓ Done")
+82
View File
@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""Mine 18 godišnjaka 2006-2024: extract klub mentions, sportaš results, trophies."""
import psycopg2, re, json
from collections import defaultdict
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
# Get all godisnjak texts
cu.execute("""SELECT id, godina, length(sadrzaj) AS chars, sadrzaj
FROM pgz_sport.dokumenti
WHERE vrsta='godisnjak' AND godina IS NOT NULL
ORDER BY godina""")
godisnjaci = cu.fetchall()
print(f"Loaded {len(godisnjaci)} godišnjaka")
# Get all PGZ klubovi for matching
cu.execute("""SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true""")
klubovi = cu.fetchall()
print(f"Active klubova: {len(klubovi)}")
# Build matching index - extract base name from naziv
def base_name(naziv):
    """Return the searchable core of a club name, at most 50 characters long.

    Three case-insensitive clean-up steps run in order, each followed by a
    whitespace strip: drop a leading club-type abbreviation (NK, RK, KK, ...),
    replace every parenthesised part with a single space, and drop a leading
    "<sport adjective> KLUB" phrase.
    """
    cleanup_steps = (
        (r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK|ŽNK|ŠD|ŠRK|HRK|HŠŠ|KAK|KKM|KKP|HOO|VKK|HMNL|ŽRK|ŠKD|ŠK|ŠHRK)\s+', ''),
        (r'\s*\([^)]+\)\s*', ' '),
        (r'^(NOGOMETNI|RUKOMETNI|VATERPOLO|ATLETSKI|TENISKI|KOŠARKAŠKI|BOĆARSKI|JEDRILIČARSKI|KARATE)\s+(KLUB|KLUB\s+)', ''),
    )
    core = naziv
    for pattern, replacement in cleanup_steps:
        core = re.sub(pattern, replacement, core, flags=re.IGNORECASE).strip()
    return core[:50]
# Index for fast lookup
klub_index = [] # (klub_id, naziv, base, base_lower)
for kid, naziv in klubovi:
if not naziv or len(naziv) < 3: continue
b = base_name(naziv)
if len(b) < 3: continue
klub_index.append((kid, naziv, b, b.lower()))
# Stats per klub: in which years did it appear?
klub_mentions = defaultdict(list) # klub_id → [godina,...]
# For each godišnjak, find clubs mentioned
for did, godina, chars, text in godisnjaci:
if not text or len(text) < 5000: continue
text_low = text.lower()
matched_in_doc = set()
for kid, naziv, base, base_low in klub_index:
if base_low in text_low:
matched_in_doc.add(kid)
print(f" godišnjak {godina}: {len(matched_in_doc)} klubova mentioned")
for kid in matched_in_doc:
klub_mentions[kid].append(godina)
# Update klubovi with godina_prvog_pojavljivanja and godina_zadnjeg
print(f"\n=== Klubovi sa mentions: {len(klub_mentions)} ===")
# Add new column
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")
updated = 0
for kid, godine in klub_mentions.items():
godine_sorted = sorted(set(godine))
cu.execute("""UPDATE pgz_sport.klubovi
SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s
WHERE id=%s""",
(godine_sorted, godine_sorted[0], godine_sorted[-1], kid))
updated += 1
print(f"Updated {updated} klubova sa godinama pojavljivanja")
# Top klubovi by mentions
top_klubovi = sorted(klub_mentions.items(), key=lambda x: len(x[1]), reverse=True)[:20]
print("\n=== TOP 20 klubova po godinama pojavljivanja ===")
for kid, godine in top_klubovi:
cu.execute("SELECT naziv FROM pgz_sport.klubovi WHERE id=%s", (kid,))
n = cu.fetchone()[0]
print(f" {len(godine):2}× {n[:60]}")
conn.close()
+117
View File
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
import psycopg2
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
# Look up the Parasport federation; this fix-up script only updates an existing row.
cu.execute("SELECT id FROM pgz_sport.savezi WHERE naziv ILIKE 'Parasportski savez%PG%' LIMIT 1")
_parasport_row = cu.fetchone()
if _parasport_row is None:
    # fetchone() returns None when nothing matches; stop with a clear message
    # instead of the TypeError that indexing None would raise.
    conn.close()
    raise SystemExit("Parasportski savez PGŽ not found in pgz_sport.savezi")
PARASPORT_ID = _parasport_row[0]
print(f"Parasport savez ID: {PARASPORT_ID}")
# Find which column name exists for address
cu.execute("""SELECT column_name FROM information_schema.columns
WHERE table_schema='pgz_sport' AND table_name='savezi'""")
cols = [r[0] for r in cu.fetchall()]
print(f"Savezi cols: {cols}")
# Use 'adresa' or whatever exists
addr_col = 'adresa' if 'adresa' in cols else ('sjediste' if 'sjediste' in cols else None)
web_col = 'web' if 'web' in cols else 'web_stranica'
email_col = 'email' if 'email' in cols else None
set_parts = []
vals = []
if addr_col:
set_parts.append(f'{addr_col} = COALESCE({addr_col}, %s)')
vals.append('Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka')
if web_col in cols:
set_parts.append(f'{web_col} = COALESCE({web_col}, %s)')
vals.append('http://www.ssoi-pgz.hr')
if email_col and email_col in cols:
set_parts.append(f'{email_col} = COALESCE({email_col}, %s)')
vals.append('ssoi-pgz@ssoi-pgz.hr')
if set_parts:
sql = f"UPDATE pgz_sport.savezi SET {', '.join(set_parts)} WHERE id = %s"
vals.append(PARASPORT_ID)
cu.execute(sql, vals)
print(f"✓ Parasport kontakt updated: {addr_col}, {web_col}, {email_col}")
# Predsjednik + tajnik u osobe_funkcije
def upsert_funkcioner(ime, prezime, funkcija, savez_id):
    """Insert an official into pgz_sport.osobe_funkcije if the name is not there yet.

    The existence check compares only ime and prezime (case-insensitively);
    funkcija and savez_id are not considered, and an existing row is left
    untouched. Runs on the module-level cursor ``cu``.

    Returns:
        True when a new row was inserted, False when the person already existed.
    """
    cu.execute("""SELECT id FROM pgz_sport.osobe_funkcije
                  WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s)""",
               (ime, prezime))
    already_present = cu.fetchone()
    if already_present:
        return False
    cu.execute("""INSERT INTO pgz_sport.osobe_funkcije
                  (ime, prezime, funkcija, savez_id) VALUES (%s,%s,%s,%s)""",
               (ime, prezime, funkcija, savez_id))
    return True
if upsert_funkcioner('Zvonimir', 'Brozić', 'predsjednik Parasportskog saveza PGŽ', PARASPORT_ID):
print("+ Brozić")
else:
print("⊘ Brozić već postoji")
if upsert_funkcioner('Luka', 'Dobrović', 'tajnik Parasportskog saveza PGŽ', PARASPORT_ID):
print("+ Dobrović")
else:
print("⊘ Dobrović već postoji")
# Insert 12+5 parasportskih klubova
PARASPORT_KLUBOVI = [
('Paraatletski klub "Srce" Rijeka', 'parasport-atletika', 'Rijeka', 'atletika invalidi'),
('Paraplivački klub "Forca"', 'parasport-plivanje', 'Rijeka', 'plivanje invalidi'),
('Parastolnoteniski klub Rijeka', 'parasport-stolni tenis','Rijeka', 'stolni tenis invalidi'),
('Parastreljački klub "Paraolimpijac"', 'parasport-streljaštvo', 'Rijeka', 'streljaštvo invalidi'),
('Sportski klub slijepih "Rijeka"', 'parasport-multisport', 'Rijeka', 'multisport za slijepe'),
('Parasportski boccia klub "Rijeka"', 'parasport-bocce', 'Rijeka', 'boćanje invalidi'),
('Klub dresurnog jahanja za osobe s invaliditetom "Pegaz"', 'parasport-jahanje', 'Rijeka', 'dresurno jahanje invalidi'),
('Parasportska udruga za rekreaciju "Rijeka"','parasport-rekrejacija', 'Rijeka', 'rekreacija invalidi'),
('KKOI Kostrena', 'parasport-multisport', 'Kostrena', 'KK osoba s invaliditetom Kostrena'),
('PAK "Rijeka"', 'parasport-multisport', 'Rijeka', 'parasport. udruga Rijeka'),
('Parasportski savez Grada Rijeke', 'parasport-multisport', 'Rijeka', 'gradski parasport savez'),
('Riječki sportski savez gluhih', 'parasport-gluhi', 'Rijeka', '5 klubova: Galeb x4 + DSR'),
# 5 gluhih klubova preko RSS gluhih
('Streljački klub gluhih "Galeb"', 'parasport-gluhi-streljaštvo', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
('Malonogometni klub gluhih "Galeb"', 'parasport-gluhi-malonogomet', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
('Kuglački klub gluhih "Galeb"', 'parasport-gluhi-kuglanje', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
('Stolnoteniski klub gluhih "Galeb"', 'parasport-gluhi-stolni tenis','Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
('Društvo sportske rekreacije gluhih "Galeb"','parasport-gluhi-rekrejacija','Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
]
inserted = 0; updated = 0
for naziv, sport_tag, grad, opis in PARASPORT_KLUBOVI:
cu.execute("SELECT id, savez_id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) LIMIT 1", (naziv,))
row = cu.fetchone()
if row:
kid, old_savez = row
cu.execute("""UPDATE pgz_sport.klubovi SET
savez_id = %s, sport = %s, region = COALESCE(region, 'PGŽ'),
grad = COALESCE(grad, %s), napomena = COALESCE(napomena, %s)
WHERE id = %s""", (PARASPORT_ID, sport_tag, grad, opis, kid))
updated += 1
else:
cu.execute("""INSERT INTO pgz_sport.klubovi
(naziv, savez_id, sport, region, grad, napomena, aktivan)
VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
(naziv, PARASPORT_ID, sport_tag, grad, opis))
inserted += 1
print(f"Parasport klubovi: inserted={inserted}, updated={updated}")
cu.execute("SELECT count(*) FROM pgz_sport.klubovi WHERE savez_id = %s", (PARASPORT_ID,))
print(f"Ukupno parasport klubova: {cu.fetchone()[0]}")
cu.execute("""SELECT k.naziv, k.sport, k.grad
FROM pgz_sport.klubovi k
WHERE k.savez_id = %s ORDER BY k.naziv""", (PARASPORT_ID,))
print("\nLista parasport klubova:")
for naziv, sport, grad in cu.fetchall():
print(f"{naziv} ({sport}) — {grad}")
conn.close()
print("\n✓ Done")
+68
View File
@@ -0,0 +1,68 @@
#!/usr/bin/env python3
"""Fast yearbook (godišnjak) mining: lower-case each yearbook text and substring-search it for every "ime prezime" / "prezime ime" member name, recording the years in which each member is mentioned."""
import psycopg2, re
from collections import defaultdict
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
godisnjaci = cu.fetchall()
print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)
# Build map: lowercase "ime prezime" → sportas_id
cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
sportasi = cu.fetchall()
name_to_ids = defaultdict(set)
for sid, ime, prezime in sportasi:
if not ime or not prezime: continue
full = f"{ime.strip()} {prezime.strip()}".lower()
full2 = f"{prezime.strip()} {ime.strip()}".lower()
if len(full) >= 8:
name_to_ids[full].add(sid)
name_to_ids[full2].add(sid)
print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)
# Process each yearbook: search its lower-cased text for every indexed name.
# (No n-gram set is built; matching is a plain substring test.)
# mentions maps member id -> set of yearbook years in which the name was found.
mentions = defaultdict(set)
for did, godina, text in godisnjaci:
# Skip missing or very short texts (under 5000 characters).
if not text or len(text) < 5000: continue
text_low = text.lower()
# Substring search is fastest for this
found_names = 0
# One name can belong to several member ids; a hit credits all of them.
for name, sids in name_to_ids.items():
if name in text_low:
for sid in sids:
mentions[sid].add(godina)
# NOTE(review): with indentation lost it is unclear whether this counts matched
# names or matched member ids — confirm against the original file.
found_names += 1
print(f" godišnjak {godina}: {found_names} matches", flush=True)
print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)
# Update DB
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")
updated = 0
for sid, godine in mentions.items():
g = sorted(godine)
cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
(g, g[0], g[-1], sid))
updated += 1
print(f"\nUpdated {updated} sportaša", flush=True)
# Top mentioned
top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
print("\nTOP 25 sportaša po godinama:")
for sid, godine in top:
cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
r = cu.fetchone()
if r:
kh = f" KAT-{r[3]}" if r[3] else ""
print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")
conn.close()
+94
View File
@@ -0,0 +1,94 @@
#!/usr/bin/env python3
# gov_hr_sport_scraper.py — Ministarstvo turizma i sporta
import os, time, hashlib, logging, re, json
from urllib.parse import urljoin, urlparse
import urllib.request
import psycopg2
from html import unescape
logging.basicConfig(level=logging.INFO, format='%(asctime)s [gov_sport] %(message)s')
log = logging.getLogger("gov_sport")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"
ROOTS = [
"https://mint.gov.hr",
"https://mint.gov.hr/sport-i-rekreacija/87",
"https://mint.gov.hr/sport-i-rekreacija/javne-potrebe-u-sportu",
"https://sport.gov.hr",
"https://hoo.hr",
]
def fetch(url):
    """Download *url* and return ``(body_text, http_status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. Any
    exception is logged as a warning and reported as ``(None, 0)``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            body = response.read().decode('utf-8', errors='replace')
            return body, response.status
    except Exception as err:
        log.warning(f"Fail {url}: {err}")
        return None, 0
def extract_text(html):
    """Reduce an HTML document to plain, whitespace-normalised text.

    Script and style elements are removed together with their content, the
    remaining tags become spaces, HTML entities are decoded, whitespace runs
    collapse to one space, and NUL characters are dropped. Returns "" for
    empty or missing input.
    """
    if not html:
        return ""
    without_scripts = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
    without_styles = re.sub(r'<style[^>]*>.*?</style>', '', without_scripts, flags=re.S|re.I)
    untagged = re.sub(r'<[^>]+>', ' ', without_styles)
    collapsed = re.sub(r'\s+', ' ', unescape(untagged))
    return collapsed.strip().replace('\x00', '')
def find_links(html, base):
    """Return the in-scope absolute links found in *html*.

    Every ``href="..."`` value is resolved against *base*. A link is kept only
    when its host is one of the crawl domains (mint.gov.hr, sport.gov.hr,
    hoo.hr) or a subdomain of one.

    Args:
        html: Page markup; empty or None yields an empty list.
        base: URL of the page, used to resolve relative links.

    Returns:
        A list of unique URLs in order of first appearance.
    """
    if not html:
        return []
    allowed_domains = ('mint.gov.hr', 'sport.gov.hr', 'hoo.hr')
    # A dict is used as an ordered set, so the result order is stable. The
    # previous list(set(...)) made the caller's [:25] slice pick a different
    # subset of links on every run.
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            u = urljoin(base, m.group(1))
            host = urlparse(u).hostname or ""
        except ValueError:
            # Malformed href (e.g. an unterminated IPv6 bracket): skip it
            # rather than abort the whole page.
            continue
        # Match whole domain labels. A plain substring test also accepted
        # look-alike hosts such as "evilhoo.hr" or "hoo.hr.evil.com".
        if any(host == d or host.endswith('.' + d) for d in allowed_domains):
            found[u] = None
    return list(found)
def harvest():
"""Crawl the ROOTS sites and store page text plus short knowledge chunks.

Visits at most 150 URLs in first-in-first-out order, sleeping 2 seconds before
each request. A page is stored in pgz_sport.dokumenti when it returns HTTP 200
and yields at least 200 characters of extracted text. Pages whose text contains
a sport keyword also contribute up to four 800-character chunks to
dabi.knowledge. Returns None; the totals are written to the log.
"""
# Autocommit: every INSERT below is committed on its own.
conn = psycopg2.connect(DSN); conn.autocommit = True
cur = conn.cursor()
visited = set(); queue = list(ROOTS)
# docs / facts count rows actually inserted (cur.rowcount is 0 on conflict).
docs = facts = 0
while queue and len(visited) < 150:
# Oldest queued URL first.
url = queue.pop(0)
if url in visited: continue
visited.add(url)
# Fixed delay before every request.
time.sleep(2)
html, status = fetch(url)
if not html or status != 200: continue
log.info(f"[{status}] {url[:80]}")
text = extract_text(html)
# Ignore pages with too little text.
if len(text) < 200: continue
title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
# Fall back to the (truncated) URL when the page has no <title>.
title = title_m.group(1).strip() if title_m else url[:80]
# Fingerprint covers only the first 5000 characters of the text.
sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
try:
cur.execute("""INSERT INTO pgz_sport.dokumenti
(url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
(url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'MTIS/HOO'))
docs += cur.rowcount
except Exception as e:
# NOTE(review): insert errors are discarded without logging (e is unused).
pass
# Knowledge extract — sport-relevant
if any(kw in text.lower() for kw in ['sport', 'klub', 'savez', 'sportaš', 'natjecanj', 'olimpij', 'paraolimp']):
# 800-character windows over the first 4000 characters; at most four are used.
chunks = [text[i:i+800] for i in range(0, min(len(text), 4000), 800)]
for ci, chunk in enumerate(chunks[:4]):
if len(chunk) < 200: continue
# Dedup key: URL + chunk index + first 100 characters of the chunk.
fact_hash = hashlib.sha256((url+str(ci)+chunk[:100]).encode()).hexdigest()[:32]
try:
cur.execute("""INSERT INTO dabi.knowledge
(fact, category, source, source_refs, confidence, data_hash, created_at)
VALUES (%s,'gov_hr_sport','gov_hr_sport_scraper',%s::jsonb,0.85,%s,now())
ON CONFLICT (data_hash) DO NOTHING""",
(chunk[:1500], json.dumps([{"url":url}]), fact_hash))
facts += cur.rowcount
# NOTE(review): bare except also swallows KeyboardInterrupt/SystemExit and hides DB errors.
except: pass
# Queue at most 25 links found on this page that are neither visited nor queued.
# NOTE(review): with indentation lost it is unclear whether this runs for every
# stored page or only for sport-relevant ones — confirm against the original file.
for l in find_links(html, url)[:25]:
if l not in visited and l not in queue: queue.append(l)
log.info(f"FINAL: visited={len(visited)} docs={docs} facts={facts}")
cur.close(); conn.close()
if __name__ == "__main__":
harvest()
+209
View File
@@ -0,0 +1,209 @@
#!/usr/bin/env python3
"""
Hrvatski boćarski savez (HBS) scraper.
Strategy:
- For each PGŽ-region boćarski klub, try slug from naziv → fetch /klubovi/{slug}/
- Parse "Popis igrača" section using regex: "N. E-XXXX, Ime Prezime, GGGG."
- Upsert into clanovi with source='hbs_savez', source_id=<reg_broj>
Modes:
python hbs_bocar.py probe <slug> — fetch single klub
python hbs_bocar.py klub <db_klub_id> — scrape one klub by DB id
python hbs_bocar.py all — sweep all PGŽ-region boćarski klubovi
"""
import os, re, sys, time, logging
from datetime import datetime, date
import psycopg2, psycopg2.extras
import requests
from bs4 import BeautifulSoup
# Connection settings for the rinet_v3 database.
# The password can be supplied through the PGZ_DB_PASSWORD environment variable
# so that it does not have to live in the repository.
# NOTE(review): the literal fallback is kept only so existing cron jobs keep
# working; it is committed to version control and should be rotated and removed.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ.get('PGZ_DB_PASSWORD', 'R1net2026!SecureDB#v7'))
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
RATE_S = 1.0
TIMEOUT = 25
log = logging.getLogger("hbs")
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO,
handlers=[logging.FileHandler('/opt/pgz-sport/_logs/hbs_scraper.log'), logging.StreamHandler(sys.stdout)])
def conn():
    """Open and return a new psycopg2 connection built from the module-level DB settings."""
    connection = psycopg2.connect(**DB)
    return connection
def fetch(url):
    """GET *url* with the bot User-Agent and return the response body as text.

    Args:
        url: Absolute URL to download.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: The request itself failed.
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        # Pause after every request, failed ones included. Probing slug
        # candidates produces mostly 404s, and those used to skip the delay,
        # so the rate limit was bypassed exactly where requests were densest.
        time.sleep(RATE_S)
def slugify(s):
    """Turn *s* into a lower-case, hyphen-separated ASCII slug.

    Croatian letters č/ć/š/ž/đ map to c/c/s/z/d; every other run of characters
    outside a-z and 0-9 becomes a single hyphen, and leading or trailing
    hyphens are removed.
    """
    lowered = s.lower().strip()
    transliterated = lowered.translate(str.maketrans('čćšžđ', 'ccszd'))
    return re.sub(r'[^a-z0-9]+', '-', transliterated).strip('-')
def naziv_to_slug_candidates(naziv):
    """Build an ordered, de-duplicated list of URL slugs to try for a club name.

    Candidates, in order: the name without its first matching leading prefix
    ("boćarski klub", "bk", ...), the full name, and the name with every
    "boćarski" / "klub" occurrence removed. Empty slugs are dropped.
    Example: 'BK Halubjan' → ['halubjan', 'bk-halubjan'].
    """
    lowered = naziv.lower()
    prefixes = ('boćarski klub', 'bocarski klub', 'b.k.', 'bk', 'b k', 'klub', 'društvo')
    # Only the first matching prefix is stripped.
    first_prefix = next((p for p in prefixes if lowered.startswith(p)), None)

    raw_variants = []
    if first_prefix is not None:
        raw_variants.append(lowered[len(first_prefix):].strip())
    raw_variants.append(lowered)
    raw_variants.append(lowered.replace('boćarski','').replace('klub','').strip())

    slugs = [slugify(variant) for variant in raw_variants]
    # dict.fromkeys keeps first-seen order while removing duplicates.
    return list(dict.fromkeys(slug for slug in slugs if slug))
def parse_klub_page(html, klub_url=None):
    """Parse an HBS club page into club metadata and a player list.

    Returns:
        A dict with keys ``klub_url`` (as passed in), ``players`` (one dict
        per player with reg_broj, ime, prezime, godina_rodenja) and ``meta``
        (whichever of naziv, zupanija, liga, oib, osnovan were found).
    """
    soup = BeautifulSoup(html, 'html.parser')
    meta = {}
    players = []

    # Club name comes from the first <h1>.
    heading = soup.find('h1')
    if heading:
        meta['naziv'] = heading.get_text(' ', strip=True)

    # Prefer the article body, then <main>, then <body>, then the whole document.
    container = soup.find(class_='entry-content') or soup.find('main') or soup.body or soup
    text = container.get_text(' ', strip=True)

    # Club metadata: (key, pattern, converter applied to the captured group).
    # NOTE(review): [^A-Z] cannot match an ASCII capital, so a value that
    # starts with one is never captured — confirm against real page text.
    meta_fields = (
        ('zupanija', r'Županija:\s*([^A-Z]+?)(?=Liga|Adresa|$)', str.strip),
        ('liga', r'Liga:\s*([^A-Z]+?)(?=Adresa|Sportske|$)', str.strip),
        ('oib', r'OIB:\s*(\d{11})', str),
        ('osnovan', r'osnivanja:\s*(\d{4})', int),
    )
    for key, pattern, convert in meta_fields:
        found = re.search(pattern, text)
        if found:
            meta[key] = convert(found.group(1))

    # Players are listed as "N. E-XXXX, Ime Prezime, GGGG."
    # Registration numbers vary: E-2755-11, E-02010, E-1317-04, ...
    player_re = re.compile(r'(\d+)\.\s+(E-[\dA-Z\-]+),\s+([^,]+?),\s+(\d{4})\.?', re.UNICODE)
    for hit in player_re.finditer(text):
        full_name = hit.group(3).strip()
        # The last word is the surname; everything before it is the first name.
        first, separator, last = full_name.rpartition(' ')
        if not separator:
            first, last = full_name, ''
        players.append({
            'reg_broj': hit.group(2).strip(),
            'ime': first,
            'prezime': last,
            'godina_rodenja': int(hit.group(4)),
        })

    return {"klub_url": klub_url, "players": players, "meta": meta}
def cmd_klub(klub_id_db):
    """Scrape one club's HBS page and upsert its players into pgz_sport.clanovi.

    Up to five slug candidates derived from the club name are tried; the first
    page that yields players is used. Players are matched on
    (source='hbs_savez', source_id=<registration number>): existing rows are
    updated, new ones inserted. When the page shows an OIB, empty oib and
    web_stranica fields of the club are filled in.

    Args:
        klub_id_db: Primary key of the club in pgz_sport.klubovi.

    Returns:
        Number of players processed, or 0 when the club id is unknown or no
        candidate page listed any players.
    """
    # psycopg2's "with connection" only ends the transaction and never closes
    # the connection, which leaked two connections per club. Close explicitly.
    db = conn()
    try:
        cu = db.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
        klub = cu.fetchone()
    finally:
        db.close()
    if not klub:
        log.error(f"Klub #{klub_id_db} not found")
        return 0

    candidates = naziv_to_slug_candidates(klub['naziv'])
    log.info(f"Klub: {klub['naziv']} candidates={candidates[:5]}")
    parsed = None
    used_slug = None
    for slug in candidates[:5]:
        url = f"{BASE}/klubovi/{slug}/"
        try:
            html = fetch(url)
            p = parse_klub_page(html, url)
        except requests.HTTPError as e:
            # A 404 simply means this slug guess was wrong; stay quiet about it.
            if e.response.status_code != 404:
                log.warning(f" {slug}: {e}")
            continue
        except Exception as e:
            log.warning(f" {slug}: {e}")
            continue
        # A page without players is not accepted; keep trying other slugs.
        if p.get('players'):
            parsed = p
            used_slug = slug
            break
    if not parsed:
        log.warning(f" → no match for {klub['naziv']} (tried {candidates[:5]})")
        return 0

    # Upsert players
    klub_url = f"{BASE}/klubovi/{used_slug}/"
    n = 0
    db = conn()
    try:
        cu = db.cursor()
        for pl in parsed['players']:
            # source_id = reg_broj (HBS unique)
            cu.execute("""SELECT id FROM pgz_sport.clanovi
                          WHERE source='hbs_savez' AND source_id=%s""", (pl['reg_broj'],))
            row = cu.fetchone()
            player_slug = slugify(pl['ime'] + ' ' + pl['prezime'])
            datum_aprox = f"{pl['godina_rodenja']}-01-01"  # only year known
            if row:
                cu.execute("""UPDATE pgz_sport.clanovi
                              SET ime=%s, prezime=%s, klub_id=%s, source_url=%s, source_synced_at=now()
                              WHERE id=%s""", (pl['ime'], pl['prezime'], klub_id_db, klub_url, row[0]))
            else:
                cu.execute("""INSERT INTO pgz_sport.clanovi
                              (klub_id, ime, prezime, datum_rodenja, source, source_id, source_url,
                               source_synced_at, slug, biografija)
                              VALUES (%s,%s,%s,%s,'hbs_savez',%s,%s,now(),%s,%s)""",
                           (klub_id_db, pl['ime'], pl['prezime'], datum_aprox,
                            pl['reg_broj'], klub_url, player_slug,
                            f"Reg. broj HBS: {pl['reg_broj']} · Godina rođenja: {pl['godina_rodenja']}"))
            n += 1
        # Fill in club OIB / web page only where they are still empty.
        if parsed['meta'].get('oib'):
            cu.execute("""UPDATE pgz_sport.klubovi
                          SET oib=COALESCE(NULLIF(oib,''),%s),
                              web_stranica=COALESCE(NULLIF(web_stranica,''), %s),
                              source_synced_at=now()
                          WHERE id=%s""",
                       (parsed['meta']['oib'], klub_url, klub_id_db))
        db.commit()
    finally:
        # Closing without a commit discards the open transaction, so a failure
        # part-way through leaves no partial upsert behind.
        db.close()
    log.info(f" ✓ {n} igrača za {klub['naziv']} (slug={used_slug})")
    return n
def cmd_all():
    """Scrape every active boćanje club and log the totals.

    Club ids are read from pgz_sport.klubovi (sport='boćanje', aktivan=true)
    in id order and handed to cmd_klub() one by one. A failure on one club is
    logged and does not stop the sweep.
    """
    # psycopg2's "with connection" does not close the connection; close it
    # explicitly once the ids are loaded.
    db = conn()
    try:
        cu = db.cursor()
        cu.execute("""SELECT id FROM pgz_sport.klubovi
                      WHERE sport='boćanje' AND aktivan=true
                      ORDER BY id""")
        kids = [r[0] for r in cu.fetchall()]
    finally:
        db.close()
    log.info(f"Sweeping {len(kids)} boćarski klubovi (PGŽ)")
    total = 0
    found_clubs = 0
    for kid in kids:
        try:
            n = cmd_klub(kid)
        except Exception as e:
            log.error(f"klub {kid}: {e}")
            continue
        total += n
        if n > 0:
            found_clubs += 1
    log.info(f"DONE: {total} igrača iz {found_clubs}/{len(kids)} klubova")
if __name__ == '__main__':
    # Command-line entry point: probe <slug> | klub <db_klub_id> | all
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    # 'probe' and 'klub' need a second argument; print the usage text instead
    # of failing with an IndexError.
    if cmd in ('probe', 'klub') and len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)
    if cmd == 'probe':
        # Fetch and parse a single club page without touching the database.
        html = fetch(f"{BASE}/klubovi/{sys.argv[2]}/")
        out = parse_klub_page(html)
        import json
        print(json.dumps(out, ensure_ascii=False, indent=2))
    elif cmd == 'klub':
        cmd_klub(int(sys.argv[2]))
    elif cmd == 'all':
        cmd_all()
    else:
        print(f"unknown: {cmd}")
        sys.exit(2)
+5
View File
@@ -0,0 +1,5 @@
#!/bin/bash
# Daily HBS scrape + lige refresh
# Stop if the scrapers directory is missing, so hbs_scraper.py is never
# resolved relative to whatever directory cron happened to start in.
cd /opt/pgz-sport/scrapers || exit 1
python3 hbs_scraper.py >> /opt/pgz-sport/_logs/hbs_cron.log 2>&1
# NOTE(review): /tmp/b3.py sits in a world-writable directory that is usually
# cleared on reboot — consider moving it next to the other scrapers.
python3 /tmp/b3.py >> /opt/pgz-sport/_logs/hbs_lige_cron.log 2>&1
+161
View File
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
import psycopg2, requests, re, html as h, time
from datetime import datetime
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
sess = requests.Session()
sess.headers.update({"User-Agent": UA})
LIGE = [
# (url, naziv, sezona, razina)
("https://hrvatski-bocarski-savez.hr/lige/i-hbl/?sezona=2025-2026", "I HBL", "2025/2026", "1.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-sjever/?sezona=2025-2026", "II HBL sjever", "2025/2026", "2.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-jug/?sezona=2025-2026", "II HBL jug", "2025/2026", "2.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-istra-primorje/?sezona=2025-2026", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-zagreb-slavonija/?sezona=2025-2026", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-srednja-dalmacija/?sezona=2025-2026", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-dubrovnik-neretva/?sezona=2025-2026", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-zenska-bocarska-liga/?sezona=2025-2026", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
("https://hrvatski-bocarski-savez.hr/lige/juniorska-liga/?sezona=2025-2026", "Juniorska liga", "2025/2026", "Juniorska"),
("https://hrvatski-bocarski-savez.hr/lige/kadetska-liga/?sezona=2025-2026", "Kadetska liga", "2025/2026", "Kadetska"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-raffa-liga/?sezona=2026", "Hrvatska Raffa liga", "2026", "HRL"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-petanque-liga/?sezona=2026", "Hrvatska Petanque liga", "2026", "HPL"),
]
# PGŽ club keywords - used to mark a league as PGŽ-relevant.
# Each keyword is tested as a substring of the lower-cased club name, so it
# must be lower-case and carry the same diacritics as the club name does.
# "čavle" is added next to the ASCII "cavle", which cannot match "Čavle";
# the duplicated "mrkopalj" and "sušak" entries are listed once.
PGZ_KEYWORDS = ["pula","istra","poreč","rijeka","kastav","krimeja","podhum","kukuljanovo","zagon",
                "gornji kraj","ladvić","sveti rok","klana","sveti jakov","jadranovo","krk","cres",
                "lošinj","opatija","lovran","brod-moravice","brod moravice","skrad","mošćenice",
                "mrkopalj","fužine","hreljin","bakar","kostrena","cavle","čavle","drenova","srdoči",
                "vargon","sušak","novi vinodolski","crikvenica","selce","grižane","baška",
                "punat","omišalj","malinska","vrbnik","mali lošinj","ščelo","lokve",
                "delnice","šparta","mošćenička draga","kraljevica","trsat","plase","matulji"]
def fetch(url):
    """Return the body of *url* as text, or None on failure.

    A non-200 response and a transport error (timeout, DNS failure, reset
    connection) are both reported as None.
    """
    try:
        r = sess.get(url, timeout=20)
    except requests.RequestException:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and so made the script hard to stop.
        return None
    return r.text if r.status_code == 200 else None
def parse_table(html_text):
    """Parse an HBS league standings table out of a page.

    The first ``<tr>`` in *html_text* is treated as the header row and must
    contain a position column ("Poredak" / "Poz..."); otherwise an empty list
    is returned. Every later row with at least five cells and an integer in
    the first cell becomes a dict with the keys poz, klub, odigrano, pobjede,
    nerij, porazi, gz, gp, razl and bod. Missing or non-numeric counters
    become 0; rows that cannot be parsed are skipped.
    """
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text, re.DOTALL)
    if not rows:
        return []

    def cell_texts(row_html):
        # Text of every <th>/<td> in the row with nested tags removed.
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
        return [re.sub(r'<[^>]+>', '', c).strip() for c in cells]

    def counter(cells, idx):
        # Non-negative integer at column idx, or 0 when absent / not numeric.
        return int(cells[idx]) if len(cells) > idx and cells[idx].isdigit() else 0

    # First row is header. Required: Poredak, Klub
    headers = cell_texts(rows[0])
    if not any("oredak" in head or "Poz" in head for head in headers):
        return []

    out = []
    for row in rows[1:]:
        cells = cell_texts(row)
        if len(cells) < 5:
            continue
        try:
            poz = int(cells[0])
            razl = 0
            if len(cells) > 8:
                # Drop the explicit plus sign and thousands comma, and map the
                # typographic minus (U+2212) and en dash to an ASCII hyphen.
                # The previous ``.replace('', '-')`` put a hyphen between all
                # characters, so int() always failed and razl was always 0.
                razl_str = (cells[8].replace('+', '')
                                    .replace('\u2212', '-')
                                    .replace('\u2013', '-')
                                    .replace(',', ''))
                try:
                    razl = int(razl_str)
                except ValueError:
                    razl = 0
            bod = int(cells[9]) if len(cells) > 9 and cells[9].lstrip('-').isdigit() else 0
            out.append({
                "poz": poz, "klub": cells[1],
                "odigrano": counter(cells, 2), "pobjede": counter(cells, 3),
                "nerij": counter(cells, 4), "porazi": counter(cells, 5),
                "gz": counter(cells, 6), "gp": counter(cells, 7),
                "razl": razl, "bod": bod,
            })
        except (ValueError, IndexError):
            continue
    return out
def find_klub_id(cur, naziv):
    """Find a klub id in pgz_sport.klubovi for a scraped club name.

    Lookups run from strict to loose: case-insensitive exact match, then the
    name preceded by a common club prefix, then a plain "contains" match.
    Returns the first id found, or None. An empty or blank name returns None
    without querying; it used to build the pattern ``%%``, which matches
    every row and so returned an arbitrary club.

    NOTE(review): id 4426 is excluded from every lookup; the reason is not
    visible here - confirm it is still the right record to skip.
    """
    if not naziv or not naziv.strip():
        return None
    # Try exact
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1", (naziv,))
    r = cur.fetchone()
    if r:
        return r[0]
    # Try with BK/MK/ŽBK prefix
    for prefix in ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]:
        cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1",
                    (f"%{prefix} {naziv}%",))
        r = cur.fetchone()
        if r:
            return r[0]
    # Last try: contains
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{naziv}%",))
    r = cur.fetchone()
    return r[0] if r else None
# === MAIN ===
# Scrape every league in LIGE, upsert its row in natjecanja and replace its
# standings in natjecanja_tablice. Runs at module level when the script starts.
conn = psycopg2.connect(**DB)
# Explicit transactions (one commit per league) instead of autocommit: the
# DELETE of the old standings and the INSERTs of the new ones become visible
# together, so a crash mid-league no longer leaves an empty or partial table.
conn.autocommit = False
try:
    cr = conn.cursor()
    # Find boćanje savez_id
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    r = cr.fetchone()
    conn.commit()  # do not keep a transaction open while pages are downloaded
    savez_id = r[0] if r else None
    print(f"savez_id (HBS): {savez_id}")
    total_natj = 0; total_redova = 0; total_pgz_klub = 0
    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = parse_table(body)
        if not rows:
            print(" no rows parsed")
            continue
        # Check PGZ relevance: any club name containing one of the keywords
        is_pgz = any(any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS) for row in rows)
        # Insert natjecanje; external_id is "<league slug>_<season>"
        external_id = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/","_") + "_" + sezona.replace("/","_")
        cr.execute("""
            INSERT INTO pgz_sport.natjecanja
            (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
            VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
            ON CONFLICT (source, external_id) DO UPDATE SET
            updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
            RETURNING id
            """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
        natj_id = cr.fetchone()[0]
        total_natj += 1
        print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")
        # Clear old data for this natjecanje + insert new
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
        for row in rows:
            kid = find_klub_id(cr, row["klub"])
            if kid: total_pgz_klub += 1
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                      row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
            total_redova += 1
        # Publish this league's upsert, delete and inserts as one unit.
        conn.commit()
        time.sleep(0.6)
    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")
    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    # Closing without a commit discards the league that was in progress.
    conn.close()
+161
View File
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
import psycopg2, requests, re, html as h, time
from datetime import datetime
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
sess = requests.Session()
sess.headers.update({"User-Agent": UA})
LIGE = [
# (url, naziv, sezona, razina)
("https://hrvatski-bocarski-savez.hr/lige/i-hbl/?sezona=2025-2026", "I HBL", "2025/2026", "1.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-sjever/?sezona=2025-2026", "II HBL sjever", "2025/2026", "2.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-jug/?sezona=2025-2026", "II HBL jug", "2025/2026", "2.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-istra-primorje/?sezona=2025-2026", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-zagreb-slavonija/?sezona=2025-2026", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-srednja-dalmacija/?sezona=2025-2026", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-dubrovnik-neretva/?sezona=2025-2026", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-zenska-bocarska-liga/?sezona=2025-2026", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
("https://hrvatski-bocarski-savez.hr/lige/juniorska-liga/?sezona=2025-2026", "Juniorska liga", "2025/2026", "Juniorska"),
("https://hrvatski-bocarski-savez.hr/lige/kadetska-liga/?sezona=2025-2026", "Kadetska liga", "2025/2026", "Kadetska"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-raffa-liga/?sezona=2026", "Hrvatska Raffa liga", "2026", "HRL"),
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-petanque-liga/?sezona=2026", "Hrvatska Petanque liga", "2026", "HPL"),
]
# PGŽ club keywords - used to mark a league as PGŽ-relevant.
# Each keyword is tested as a substring of the lower-cased club name, so it
# must be lower-case and carry the same diacritics as the club name does.
# "čavle" is added next to the ASCII "cavle", which cannot match "Čavle";
# the duplicated "mrkopalj" and "sušak" entries are listed once.
PGZ_KEYWORDS = ["pula","istra","poreč","rijeka","kastav","krimeja","podhum","kukuljanovo","zagon",
                "gornji kraj","ladvić","sveti rok","klana","sveti jakov","jadranovo","krk","cres",
                "lošinj","opatija","lovran","brod-moravice","brod moravice","skrad","mošćenice",
                "mrkopalj","fužine","hreljin","bakar","kostrena","cavle","čavle","drenova","srdoči",
                "vargon","sušak","novi vinodolski","crikvenica","selce","grižane","baška",
                "punat","omišalj","malinska","vrbnik","mali lošinj","ščelo","lokve",
                "delnice","šparta","mošćenička draga","kraljevica","trsat","plase","matulji"]
def fetch(url):
    """Return the body of *url* as text, or None on failure.

    A non-200 response and a transport error (timeout, DNS failure, reset
    connection) are both reported as None.
    """
    try:
        r = sess.get(url, timeout=20)
    except requests.RequestException:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit and so made the script hard to stop.
        return None
    return r.text if r.status_code == 200 else None
def parse_table(html_text):
    """Parse an HBS league standings table out of a page.

    The first ``<tr>`` in *html_text* is treated as the header row and must
    contain a position column ("Poredak" / "Poz..."); otherwise an empty list
    is returned. Every later row with at least five cells and an integer in
    the first cell becomes a dict with the keys poz, klub, odigrano, pobjede,
    nerij, porazi, gz, gp, razl and bod. Missing or non-numeric counters
    become 0; rows that cannot be parsed are skipped.
    """
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text, re.DOTALL)
    if not rows:
        return []

    def cell_texts(row_html):
        # Text of every <th>/<td> in the row with nested tags removed.
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
        return [re.sub(r'<[^>]+>', '', c).strip() for c in cells]

    def counter(cells, idx):
        # Non-negative integer at column idx, or 0 when absent / not numeric.
        return int(cells[idx]) if len(cells) > idx and cells[idx].isdigit() else 0

    # First row is header. Required: Poredak, Klub
    headers = cell_texts(rows[0])
    if not any("oredak" in head or "Poz" in head for head in headers):
        return []

    out = []
    for row in rows[1:]:
        cells = cell_texts(row)
        if len(cells) < 5:
            continue
        try:
            poz = int(cells[0])
            razl = 0
            if len(cells) > 8:
                # Drop the explicit plus sign and thousands comma, and map the
                # typographic minus (U+2212) and en dash to an ASCII hyphen.
                # The previous ``.replace('', '-')`` put a hyphen between all
                # characters, so int() always failed and razl was always 0.
                razl_str = (cells[8].replace('+', '')
                                    .replace('\u2212', '-')
                                    .replace('\u2013', '-')
                                    .replace(',', ''))
                try:
                    razl = int(razl_str)
                except ValueError:
                    razl = 0
            bod = int(cells[9]) if len(cells) > 9 and cells[9].lstrip('-').isdigit() else 0
            out.append({
                "poz": poz, "klub": cells[1],
                "odigrano": counter(cells, 2), "pobjede": counter(cells, 3),
                "nerij": counter(cells, 4), "porazi": counter(cells, 5),
                "gz": counter(cells, 6), "gp": counter(cells, 7),
                "razl": razl, "bod": bod,
            })
        except (ValueError, IndexError):
            continue
    return out
def find_klub_id(cur, naziv):
    """Find a klub id in pgz_sport.klubovi for a scraped club name.

    Lookups run from strict to loose: case-insensitive exact match, then the
    name preceded by a common club prefix, then a plain "contains" match.
    Returns the first id found, or None. An empty or blank name returns None
    without querying; it used to build the pattern ``%%``, which matches
    every row and so returned an arbitrary club.

    NOTE(review): id 4426 is excluded from every lookup; the reason is not
    visible here - confirm it is still the right record to skip.
    """
    if not naziv or not naziv.strip():
        return None
    # Try exact
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1", (naziv,))
    r = cur.fetchone()
    if r:
        return r[0]
    # Try with BK/MK/ŽBK prefix
    for prefix in ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]:
        cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1",
                    (f"%{prefix} {naziv}%",))
        r = cur.fetchone()
        if r:
            return r[0]
    # Last try: contains
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1", (f"%{naziv}%",))
    r = cur.fetchone()
    return r[0] if r else None
# === MAIN ===
# Scrape every league in LIGE, upsert its row in natjecanja and replace its
# standings in natjecanja_tablice. Runs at module level when the script starts.
conn = psycopg2.connect(**DB)
# Explicit transactions (one commit per league) instead of autocommit: the
# DELETE of the old standings and the INSERTs of the new ones become visible
# together, so a crash mid-league no longer leaves an empty or partial table.
conn.autocommit = False
try:
    cr = conn.cursor()
    # Find boćanje savez_id
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    r = cr.fetchone()
    conn.commit()  # do not keep a transaction open while pages are downloaded
    savez_id = r[0] if r else None
    print(f"savez_id (HBS): {savez_id}")
    total_natj = 0; total_redova = 0; total_pgz_klub = 0
    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = parse_table(body)
        if not rows:
            print(" no rows parsed")
            continue
        # Check PGZ relevance: any club name containing one of the keywords
        is_pgz = any(any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS) for row in rows)
        # Insert natjecanje; external_id is "<league slug>_<season>"
        external_id = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/","_") + "_" + sezona.replace("/","_")
        cr.execute("""
            INSERT INTO pgz_sport.natjecanja
            (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
            VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
            ON CONFLICT (source, external_id) DO UPDATE SET
            updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
            RETURNING id
            """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
        natj_id = cr.fetchone()[0]
        total_natj += 1
        print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")
        # Clear old data for this natjecanje + insert new
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
        for row in rows:
            kid = find_klub_id(cr, row["klub"])
            if kid: total_pgz_klub += 1
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                      row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
            total_redova += 1
        # Publish this league's upsert, delete and inserts as one unit.
        conn.commit()
        time.sleep(0.6)
    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")
    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    # Closing without a commit discards the league that was in progress.
    conn.close()
+229
View File
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.
Parsira PDF rezultate, ekstrahira plasmane (1-3 = medalje),
matcha s PGŽ igračima u DB-u, ubacuje u clan_nagrada.
"""
import os, re, sys, time, json, html as ht
import urllib.request, urllib.parse
import subprocess
import psycopg2
import datetime as dt
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
TMP = "/tmp/hbs_pdf"
os.makedirs(TMP, exist_ok=True)
DELAY = 1.0
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
def log(msg):
    """Print a timestamped line to stdout and append it to LOG_FP.

    File logging is best-effort: if the log file cannot be opened or written,
    the line is still printed and the I/O error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so non-ASCII text in a message does not depend on the
        # locale of the calling environment.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Was a bare ``except:``, which also hid KeyboardInterrupt and bugs.
        pass
def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are replaced. Any error is logged and None is returned.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read()
        return payload.decode("utf-8", errors="replace")
    except Exception as err:
        log(f"FETCH err {url}: {err}")
        return None
def fetch_pdf(url, dst):
    """Download *url* into the file *dst*; return True on success, else False.

    The body is written to ``dst + ".part"`` and renamed to *dst* only after
    the download completed. A failed or interrupted transfer therefore never
    leaves a truncated file at *dst*, which matters because main() treats an
    existing *dst* as an already-downloaded PDF and does not fetch it again.
    """
    part = dst + ".part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r, open(part, "wb") as f:
            f.write(r.read())
        os.replace(part, dst)
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        # Best-effort removal of the partial download.
        try:
            os.remove(part)
        except OSError:
            pass
        return False
def pdf_text(path):
    """Return the text of the PDF at *path* as produced by ``pdftotext -layout``.

    Output is decoded as UTF-8 with undecodable bytes replaced. Returns ""
    when the tool cannot be started or does not finish within 20 seconds.
    The failure is logged; it used to be swallowed silently, which made a
    missing ``pdftotext`` binary look like a run with no results.
    """
    try:
        proc = subprocess.run(["pdftotext", "-layout", path, "-"],
                              capture_output=True, timeout=20, check=False)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        log(f"pdftotext err {path}: {e}")
        return ""
    return proc.stdout.decode("utf-8", errors="replace")
def db():
    """Open a new psycopg2 connection from the DB settings, in autocommit mode."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def discover_pdfs(godina):
    """Collect the PDF result links for year *godina* from three HBS listing pages.

    Returns a list of dicts with the keys natj, url, label, kat, disc and
    godina. ``kat`` is the text of the last ``<h3>`` found in the 2000
    characters before the link ("?" if none), ``disc`` the last ``<h4>`` text
    up to a colon in the same window (the link label if none). Pages that
    cannot be fetched are skipped.
    """
    listing_pages = [
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    ]
    found = []
    for natj, slug in listing_pages:
        page = fetch_html(BASE + slug)
        if not page:
            continue
        # PDF links go through /cdn/content/...pdf
        links = re.finditer(r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>', page)
        for link in links:
            url = link.group(1)
            label = link.group(2).strip()
            # Headings shortly before the link give the age category (h3)
            # and the discipline (h4).
            start = link.start()
            before = page[max(0, start-2000):start]
            h3_hits = re.findall(r'<h3[^>]*>([^<]+)</h3>', before)
            h4_hits = re.findall(r'<h4[^>]*>([^<:]+)[:]', before)
            if h3_hits:
                kat = h3_hits[-1].strip()
            else:
                kat = "?"
            if h4_hits:
                disc = h4_hits[-1].strip()
            else:
                disc = label
            found.append({"natj": natj, "url": url, "label": label, "kat": kat, "disc": disc, "godina": godina})
        # Pause between listing pages.
        time.sleep(DELAY)
    return found
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the tail of the document is scanned: from the last occurrence of
    "poredak" or "konačni" (whichever is later), or the last 30% when neither
    word occurs. A line counts when it reads "<rank>[.] <name>  <club>", with
    at least two whitespace characters between name and club.

    Returns a list of ``{"rank", "ime_full", "klub"}`` dicts for ranks 1-8,
    one per rank (first occurrence wins), in order of appearance. Despite the
    name, ranks beyond 3 are included. *pdf_meta* is accepted but not used.
    """
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No marker found: fall back to the last 30% of the document
        start = int(len(text) * 0.7)

    # "1. Marko Markovic   K.K. KASTAV" - the dot after the rank is optional
    line_re = re.compile(r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")

    first_by_rank = {}
    for raw_line in text[start:].split("\n"):
        hit = line_re.match(raw_line.strip())
        if hit is None:
            continue
        rank = int(hit.group(1))
        player = hit.group(2).strip()
        club = hit.group(3).strip()
        if not (1 <= rank <= 12 and len(player) >= 5 and len(club) >= 3):
            continue
        # Keep the first line seen for each rank, ranks 1..8 only
        if rank <= 8 and rank not in first_by_rank:
            first_by_rank[rank] = {"rank": rank, "ime_full": player, "klub": club}
    return list(first_by_rank.values())
def find_clan(cr, ime_full):
    """Return the clanovi.id matching the full name *ime_full*, or None.

    Names with fewer than two words are rejected. Three lookups are tried in
    order and the first hit wins:
    1. first word as ime, the rest as prezime, restricted to boćanje;
    2. first word as ime, last word as prezime, any sport;
    3. "ime prezime" equal to the whole name, sport containing "boć".
    """
    words = ime_full.split()
    if len(words) < 2:
        return None
    first_name = words[0]
    attempts = (
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1""",
         (first_name, " ".join(words[1:]))),
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1""",
         (first_name, words[-1])),
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
         (ime_full,)),
    )
    for sql, params in attempts:
        cr.execute(sql, params)
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
def find_klub(cr, klub_name):
    """Return the id of the shortest-named klub whose naziv contains *klub_name*.

    The match is case-insensitive. Returns None for an empty name or when no
    club matches.
    """
    if klub_name:
        pattern = f"%{klub_name.lower()}%"
        cr.execute("""SELECT id FROM pgz_sport.klubovi
WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
                   (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}
def main():
    """Scrape HBS result PDFs for 2025-2023 and store placements in clan_nagrada.

    For each discovered PDF: download it unless a local copy exists, extract
    the text, parse the placements, match player and club against the
    database and insert one clan_nagrada row per placement (conflicting rows
    are left untouched). Finishes by printing a sample of matched rows.

    A failing insert is logged and skipped instead of being dropped silently,
    and the connection is closed even when the run aborts.
    """
    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")
        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)
        log(f"Total PDFs to process: {len(all_pdfs)}")
        inserted = 0; matched_clan = 0; processed = 0
        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # Local cache name: last URL segment with non-word characters replaced
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)
            text = pdf_text(local)
            if not text or len(text) < 200:
                continue
            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")
            razina = "DP" if "Prvenstvo" in pdf["natj"] else ("DK" if "Kup" in pdf["natj"] else "OS")
            natjecanje = f"{pdf['natj']} {pdf['godina']}"
            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id: matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                        razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                        napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    if cr.rowcount: inserted += 1
                except Exception as e:
                    # Still best-effort per row, but no longer silent.
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")
        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")
        # Show sample of newly inserted
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            print(f" {r[0]:25} ({r[1][:30] if r[1] else '-'}) {r[2]} - {r[3]}. ({r[4]}, {r[5][:30]}) {r[6]} clan={r[7]}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
+229
View File
@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.
Parsira PDF rezultate, ekstrahira plasmane (1-3 = medalje),
matcha s PGŽ igračima u DB-u, ubacuje u clan_nagrada.
"""
import os, re, sys, time, json, html as ht
import urllib.request, urllib.parse
import subprocess
import psycopg2
import datetime as dt
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
TMP = "/tmp/hbs_pdf"
os.makedirs(TMP, exist_ok=True)
DELAY = 1.0
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
def log(msg):
    """Print a timestamped line to stdout and append it to LOG_FP.

    File logging is best-effort: if the log file cannot be opened or written,
    the line is still printed and the I/O error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so non-ASCII text in a message does not depend on the
        # locale of the calling environment.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Was a bare ``except:``, which also hid KeyboardInterrupt and bugs.
        pass
def fetch_html(url):
    """Download *url* and return its body decoded as UTF-8.

    Undecodable bytes are replaced. Any error is logged and None is returned.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read()
        return payload.decode("utf-8", errors="replace")
    except Exception as err:
        log(f"FETCH err {url}: {err}")
        return None
def fetch_pdf(url, dst):
    """Download *url* into the file *dst*; return True on success, else False.

    The body is written to ``dst + ".part"`` and renamed to *dst* only after
    the download completed. A failed or interrupted transfer therefore never
    leaves a truncated file at *dst*, which matters because main() treats an
    existing *dst* as an already-downloaded PDF and does not fetch it again.
    """
    part = dst + ".part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r, open(part, "wb") as f:
            f.write(r.read())
        os.replace(part, dst)
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        # Best-effort removal of the partial download.
        try:
            os.remove(part)
        except OSError:
            pass
        return False
def pdf_text(path):
    """Return the text of the PDF at *path* as produced by ``pdftotext -layout``.

    Output is decoded as UTF-8 with undecodable bytes replaced. Returns ""
    when the tool cannot be started or does not finish within 20 seconds.
    The failure is logged; it used to be swallowed silently, which made a
    missing ``pdftotext`` binary look like a run with no results.
    """
    try:
        proc = subprocess.run(["pdftotext", "-layout", path, "-"],
                              capture_output=True, timeout=20, check=False)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        log(f"pdftotext err {path}: {e}")
        return ""
    return proc.stdout.decode("utf-8", errors="replace")
def db():
    """Open a new psycopg2 connection from the DB settings, in autocommit mode."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def discover_pdfs(godina):
    """Collect the PDF result links for year *godina* from three HBS listing pages.

    Returns a list of dicts with the keys natj, url, label, kat, disc and
    godina. ``kat`` is the text of the last ``<h3>`` found in the 2000
    characters before the link ("?" if none), ``disc`` the last ``<h4>`` text
    up to a colon in the same window (the link label if none). Pages that
    cannot be fetched are skipped.
    """
    listing_pages = [
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    ]
    found = []
    for natj, slug in listing_pages:
        page = fetch_html(BASE + slug)
        if not page:
            continue
        # PDF links go through /cdn/content/...pdf
        links = re.finditer(r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>', page)
        for link in links:
            url = link.group(1)
            label = link.group(2).strip()
            # Headings shortly before the link give the age category (h3)
            # and the discipline (h4).
            start = link.start()
            before = page[max(0, start-2000):start]
            h3_hits = re.findall(r'<h3[^>]*>([^<]+)</h3>', before)
            h4_hits = re.findall(r'<h4[^>]*>([^<:]+)[:]', before)
            if h3_hits:
                kat = h3_hits[-1].strip()
            else:
                kat = "?"
            if h4_hits:
                disc = h4_hits[-1].strip()
            else:
                disc = label
            found.append({"natj": natj, "url": url, "label": label, "kat": kat, "disc": disc, "godina": godina})
        # Pause between listing pages.
        time.sleep(DELAY)
    return found
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the tail of the document is scanned: from the last occurrence of
    "poredak" or "konačni" (whichever is later), or the last 30% when neither
    word occurs. A line counts when it reads "<rank>[.] <name>  <club>", with
    at least two whitespace characters between name and club.

    Returns a list of ``{"rank", "ime_full", "klub"}`` dicts for ranks 1-8,
    one per rank (first occurrence wins), in order of appearance. Despite the
    name, ranks beyond 3 are included. *pdf_meta* is accepted but not used.
    """
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No marker found: fall back to the last 30% of the document
        start = int(len(text) * 0.7)

    # "1. Marko Markovic   K.K. KASTAV" - the dot after the rank is optional
    line_re = re.compile(r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")

    first_by_rank = {}
    for raw_line in text[start:].split("\n"):
        hit = line_re.match(raw_line.strip())
        if hit is None:
            continue
        rank = int(hit.group(1))
        player = hit.group(2).strip()
        club = hit.group(3).strip()
        if not (1 <= rank <= 12 and len(player) >= 5 and len(club) >= 3):
            continue
        # Keep the first line seen for each rank, ranks 1..8 only
        if rank <= 8 and rank not in first_by_rank:
            first_by_rank[rank] = {"rank": rank, "ime_full": player, "klub": club}
    return list(first_by_rank.values())
def find_clan(cr, ime_full):
    """Return the clanovi.id matching the full name *ime_full*, or None.

    Names with fewer than two words are rejected. Three lookups are tried in
    order and the first hit wins:
    1. first word as ime, the rest as prezime, restricted to boćanje;
    2. first word as ime, last word as prezime, any sport;
    3. "ime prezime" equal to the whole name, sport containing "boć".
    """
    words = ime_full.split()
    if len(words) < 2:
        return None
    first_name = words[0]
    attempts = (
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1""",
         (first_name, " ".join(words[1:]))),
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1""",
         (first_name, words[-1])),
        ("""SELECT id FROM pgz_sport.clanovi
WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
         (ime_full,)),
    )
    for sql, params in attempts:
        cr.execute(sql, params)
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
def find_klub(cr, klub_name):
    """Return the id of the shortest-named klub whose naziv contains *klub_name*.

    The match is case-insensitive. Returns None for an empty name or when no
    club matches.
    """
    if klub_name:
        pattern = f"%{klub_name.lower()}%"
        cr.execute("""SELECT id FROM pgz_sport.klubovi
WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
                   (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}
def main():
    """Scrape HBS result PDFs for 2025-2023 and store placements in clan_nagrada.

    For each discovered PDF: download it unless a local copy exists, extract
    the text, parse the placements, match player and club against the
    database and insert one clan_nagrada row per placement (conflicting rows
    are left untouched). Finishes by printing a sample of matched rows.

    A failing insert is logged and skipped instead of being dropped silently,
    and the connection is closed even when the run aborts.
    """
    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")
        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}…")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)
        log(f"Total PDFs to process: {len(all_pdfs)}")
        inserted = 0; matched_clan = 0; processed = 0
        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # Local cache name: last URL segment with non-word characters replaced
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)
            text = pdf_text(local)
            if not text or len(text) < 200:
                continue
            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")
            razina = "DP" if "Prvenstvo" in pdf["natj"] else ("DK" if "Kup" in pdf["natj"] else "OS")
            natjecanje = f"{pdf['natj']} {pdf['godina']}"
            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id: matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                        razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                        napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    if cr.rowcount: inserted += 1
                except Exception as e:
                    # Still best-effort per row, but no longer silent.
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")
        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")
        # Show sample of newly inserted
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            print(f" {r[0]:25} ({r[1][:30] if r[1] else '-'}) {r[2]} - {r[3]}. ({r[4]}, {r[5][:30]}) {r[6]} clan={r[7]}")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
+337
View File
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
import os, re, sys, time, json, html, traceback, datetime as dt
import urllib.request, urllib.error
from urllib.parse import urljoin
import psycopg2
# Connection settings passed to psycopg2.connect(). ``port`` was given twice
# (6432 and 5432), which is a SyntaxError ("keyword argument repeated") and
# stopped the module from loading. 6432 is kept because it is the port the
# sibling scrapers pair with host 10.10.0.2.
# NOTE(review): the password is committed in plain text; consider moving it
# to the environment or a secrets store.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
BASE = "https://hrvatski-bocarski-savez.hr"
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
DELAY = 1.2
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
def log(msg):
    """Print a timestamped line to stdout and append it to LOG_FP.

    File logging is best-effort: if the log file cannot be opened or written,
    the line is still printed and the I/O error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8 so non-ASCII text in a message does not depend on the
        # locale of the calling environment.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line+"\n")
    except OSError:
        # Was a bare ``except:``, which also hid KeyboardInterrupt and bugs.
        pass
def db():
    """Open a new psycopg2 connection from the DB settings, in autocommit mode."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def fetch(url, retries=2):
    """Download *url* as UTF-8 text, retrying on failure.

    Up to ``retries + 1`` attempts are made, with a pause of ``DELAY * 2``
    seconds after each failed one. HTTP 404 and 410 return None at once
    without retrying. After the final failed attempt the error is logged and
    None is returned.
    """
    request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    for attempt in range(retries+1):
        last_attempt = attempt == retries
        try:
            request = urllib.request.Request(url, headers=request_headers)
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as err:
            # Missing or gone: retrying cannot help
            if err.code in (404, 410):
                return None
            if last_attempt:
                log(f"HTTP {err.code} {url}")
                return None
        except Exception as err:
            if last_attempt:
                log(f"FETCH err {err} {url}")
                return None
        time.sleep(DELAY * 2)
# === KLUB PARSER ===
def parse_klub(h, slug):
    """Extract club data from the HTML of a club page.

    Returns None when *h* is empty or no usable club name is found; otherwise
    a dict with keys ``slug``, ``naziv`` (name), ``logo`` (absolute URL or
    None), ``info`` (labelled fields), ``igraci`` (players listed on the page)
    and ``voditelji`` (team leaders, at most 10).
    """
    if not h:
        return None

    # Club name: first <h3> that is not the federation banner or a sponsor heading.
    naziv = None
    for raw_title in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        title = html.unescape(raw_title.strip())
        if not title or len(title) >= 80:
            continue
        if 'Fédération' in title or 'sponzor' in title.lower():
            continue
        naziv = title
        break
    if not naziv:
        return None

    # Logo
    logo_match = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, logo_match.group(1)) if logo_match else None

    # Labelled "<strong>Label:</strong> value" fields. The rules are checked in
    # order and the first label substring that matches decides the target key.
    field_rules = (
        (('županija',), 'zupanija'),
        (('liga',), 'liga'),
        (('adresa',), 'adresa'),
        (('sportske grane', 'sportska grana'), 'sportske_grane'),
        (('osoba za kontakt',), 'kontakt_osoba'),
        (('tel',), 'telefon'),
        (('oib',), 'oib'),
    )
    info = {}
    for field in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        label = field.group(1).strip().lower()
        value = html.unescape(re.sub(r'<[^>]+>', '', field.group(2).strip()))
        for needles, target in field_rules:
            if any(needle in label for needle in needles):
                info[target] = value
                break

    # Players - pattern: <li><a href="...igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    igraci = [
        {
            "slug": pm.group(1),
            "iskaznica": pm.group(2).strip(),
            "ime_prezime": html.unescape(pm.group(3).strip()),
            "godina_rodenja": int(pm.group(4)),
        }
        for pm in re.finditer(
            r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?',
            h
        )
    ]

    # Team leaders (coaches) - tab #popis_voditelja_ekipe
    voditelji = []
    vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if vsec:
        section = vsec.group(1)
        for pm in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section):
            name = re.sub(r'\s+', ' ', pm.group(1).strip())
            if len(name) <= 4 or len(name.split()) < 2:
                continue
            if 'Trenutno' in name or name in voditelji:
                continue
            voditelji.append(name)
        # Fallback when the names are not wrapped in <p> tags: treat every
        # text fragment between tags as a candidate line.
        if not voditelji:
            for fragment in re.sub(r'<[^>]+>', '\n', section).split('\n'):
                fragment = fragment.strip()
                words = fragment.split()
                if len(fragment) <= 4 or len(words) < 2 or 'Trenutno' in fragment:
                    continue
                if all(w[0].isupper() for w in words[:2] if w):
                    voditelji.append(fragment)

    return {
        "slug": slug, "naziv": naziv, "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10]
    }
# === IGRAČ PARSER ===
def parse_igrac(h, slug):
    """Extract player data from the HTML of a player profile page.

    Returns None for empty *h*; otherwise a dict with ``slug``, ``ime`` (first
    word of the name), ``prezime`` (remaining words), ``full_name``,
    ``slika_url`` (absolute URL or None), ``info`` (card number, birth year,
    home club when present) and ``karijera`` (rows of the "Sportski put" table).
    """
    if not h:
        return None

    # Name: first multi-word <h3> that is neither the federation banner nor a
    # heading containing "Sport"; fall back to a title-cased slug.
    titles = (html.unescape(t.strip()) for t in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h))
    full_name = next(
        (t for t in titles
         if t and 'Fédération' not in t and 'Sport' not in t and len(t) < 80 and len(t.split()) >= 2),
        None,
    )
    if not full_name:
        full_name = slug.replace("-", " ").title()
    name_words = full_name.split()
    ime = name_words[0] if name_words else ""
    prezime = " ".join(name_words[1:])

    # Photo
    photo = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, photo.group(1)) if photo else None

    info = {}
    for field in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        label = field.group(1).strip().lower()
        value = html.unescape(re.sub(r'<[^>]+>', '', field.group(2).strip()).rstrip('.'))
        if 'iskaznic' in label:
            info['iskaznica'] = value
        elif 'godina rođenja' in label:
            # Values without a 4-digit year are ignored.
            year = re.search(r'(\d{4})', value)
            if year:
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in label:
            info['maticni_klub'] = value

    def _cell_text(cell):
        # Drop markup and surrounding whitespace from one table cell.
        return re.sub(r'<[^>]+>', '', cell).strip()

    # Career ("Sportski put") - registration table
    karijera = []
    table_m = (re.search(r'### Sportski put.*?</table>', h, re.S)
               or re.search(r'Sportski put.*?</table>', h, re.S))
    if table_m:
        table_rows = re.findall(r'<tr>(.*?)</tr>', table_m.group(0), re.S)
        for tr in table_rows[1:]:  # skip header
            cells = re.findall(r'<td[^>]*>(.*?)</td>', tr, re.S)
            if len(cells) < 4:
                continue
            karijera.append({
                "datum_reg": _cell_text(cells[0]).rstrip('.'),
                "klub": _cell_text(cells[1]),
                "sportska_grana": _cell_text(cells[2]),
                "sezona": _cell_text(cells[3]),
                "lijecnicki": _cell_text(cells[4]).rstrip('.') if len(cells) > 4 else None
            })

    return {
        "slug": slug, "ime": ime, "prezime": prezime, "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera
    }
# PGŽ clubs - the real slugs from bocarski-savez-pgz.
# Each entry is the <slug> part of the club page URL {BASE}/klubovi/<slug>/ that main() fetches.
PGZ_HBS_CLUBS = [
# Senior clubs
"kastav", "kostrena", "krenovac", "krimeja", "krk", "lovran", "opatija",
"rijeka-2", "srdoci", "sveti-jakov", "sveti-rok-klana", "vargon", "hreljin",
"draga-moscenicka-draga", "lovranska-draga", "brod-moravice",
# Women's clubs
"zenski-bocarski-klub-cavle", "zenski-bocarski-klub-drenova-rijeka",
"zenski-bocarski-klub-hreljin", "zenski-bocarski-klub-kastav",
"zenska-bocarska-ekipa-kastav-2",
# Cadet / junior teams (youth categories)
"cavle-skola-bocanja", "juniorska-ekipa-cavle-sb-1", "juniorska-ekipa-kastav",
"juniorska-ekipa-lovran", "juniorska-ekipa-sv-rok-klana", "juniorska-ekipa-vargon",
"kadetska-ekipa-bk-cavle-sb-2", "kadetska-ekipa-bk-kastav-2",
"kadetska-ekipa-bk-lovran", "kadetska-ekipa-bk-sveti-jakov-2",
"kadetska-ekipa-bk-vargon", "kadetska-ekipa-kastav", "kadetska-ekipa-zbk-drenova",
]
def upsert_klub(conn, k):
    """Insert or update one club in pgz_sport.klubovi and return its id.

    Args:
        conn: open psycopg2 connection.
        k: club dict as produced by parse_klub (uses ``slug``, ``naziv``, ``info``).

    An existing row is located first by its ``hbs:<slug>`` marker stored in
    ``napomena``, then by sport + case-insensitive name. Existing rows only get
    address / phone / town filled in when the page provides them.
    """
    info = k.get('info', {})
    naziv = k['naziv']
    slug = k['slug']

    # Town heuristic: the last one or two words of the address.
    grad = None
    if info.get('adresa'):
        m = re.search(r'(\w+(?:\s+\w+)?)$', info['adresa'].strip())
        if m:
            grad = m.group(1)

    with conn.cursor() as cur:
        # The marker is always written as "hbs:<slug>]" (update) or
        # "hbs:<slug>," (insert). Matching the terminator as well stops a slug
        # from matching rows whose slug merely starts with it
        # (e.g. "lovran" vs "lovranska-draga").
        cur.execute(
            """SELECT id FROM pgz_sport.klubovi
               WHERE napomena ILIKE %s OR napomena ILIKE %s
               ORDER BY id LIMIT 1""",
            (f"%hbs:{slug}]%", f"%hbs:{slug},%"))
        row = cur.fetchone()
        has_marker = row is not None
        if not row:
            cur.execute(
                "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
                (naziv,))
            row = cur.fetchone()

        if row:
            kid = row[0]
            cur.execute("""UPDATE pgz_sport.klubovi SET
                    adresa=COALESCE(%s, adresa),
                    telefon=COALESCE(%s, telefon),
                    grad=COALESCE(%s, grad)
                WHERE id=%s""",
                (info.get('adresa'), info.get('telefon'), grad, kid))
            if not has_marker:
                # Tag the row once so later runs find it by slug; appending on
                # every run made napomena grow without bound.
                cur.execute("""UPDATE pgz_sport.klubovi SET
                        napomena=COALESCE(napomena,'') || ' [HBS sync ' || CURRENT_DATE || ': hbs:' || %s || ']'
                    WHERE id=%s""",
                    (slug, kid))
        else:
            cur.execute("""INSERT INTO pgz_sport.klubovi
                    (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
                VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
                RETURNING id""",
                (naziv, grad, info.get('adresa'), info.get('telefon'),
                 f"[HBS sync {dt.date.today()}: hbs:{slug}, OIB:{info.get('oib','-')}, liga:{info.get('liga','-')}]"))
            kid = cur.fetchone()[0]
    return kid
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update one player in pgz_sport.clanovi and return the row id.

    An existing row is looked up by the HBS card number (``source_id`` with
    ``source='hbs_savez'``); when there is no card number or no such row, a
    new row is inserted.
    """
    cur = conn.cursor()
    info = p.get('info', {})
    iskaznica = (info.get('iskaznica') or '').strip()

    # Look up by card number (the HBS unique ID).
    cid = None
    if iskaznica:
        cur.execute("SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s", (iskaznica,))
        found = cur.fetchone()
        if found:
            cid = found[0]

    # Values shared, in this order, by the UPDATE and the INSERT.
    values = (
        p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
        info.get('godina_rodenja'), p['slug'], iskaznica, f"{BASE}/igraci/{p['slug']}/",
    )

    if cid:
        cur.execute("""UPDATE pgz_sport.clanovi SET
                ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                klub_id=%s, klub_naziv_godisnjak=%s,
                slika_url=COALESCE(%s, slika_url),
                godina_rodenja=COALESCE(%s, godina_rodenja),
                slug=%s,
                source='hbs_savez', source_id=%s, source_url=%s, source_synced_at=now()
            WHERE id=%s""",
            values + (cid,))
        return cid

    cur.execute("""INSERT INTO pgz_sport.clanovi
            (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
            godina_rodenja, slug, source, source_id, source_url, source_synced_at)
        VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
        RETURNING id""",
        values)
    return cur.fetchone()[0]
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Insert or update a team leader (treated as a coach) in pgz_sport.clanovi.

    *name* is split into first word (``ime``) and the rest (``prezime``);
    single-word names are rejected with None. A row with the same name and
    sport 'boćanje' gets its role set to *role* and its club / source URL
    filled in only where empty; otherwise a new row is inserted.
    Returns the row id.
    """
    words = name.strip().split()
    if len(words) < 2:
        return None
    ime = words[0]
    prezime = " ".join(words[1:])
    listing_url = f"{BASE}/klubovi/"

    cur = conn.cursor()
    cur.execute("""SELECT id FROM pgz_sport.clanovi
        WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'""",
        (ime, prezime))
    existing = cur.fetchone()

    if not existing:
        cur.execute("""INSERT INTO pgz_sport.clanovi
                (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
                source, source_url, source_synced_at)
            VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
            RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, listing_url))
        return cur.fetchone()[0]

    cur.execute("""UPDATE pgz_sport.clanovi SET
            uloga=%s, klub_id=COALESCE(klub_id, %s),
            klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
            source_url=COALESCE(source_url, %s)
        WHERE id=%s""",
        (role, klub_db_id, klub_naziv, listing_url, existing[0]))
    return existing[0]
def main():
    """Scrape every club in PGZ_HBS_CLUBS and sync clubs, leaders and players.

    For each slug the club page is fetched and parsed, the club is upserted,
    then its team leaders, then every listed player (one profile request per
    player, paced by DELAY). Failures are logged and the run continues with
    the next item; the DB connection is always closed at the end.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")
        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            url = f"{BASE}/klubovi/{slug}/"
            log(f"→ KLUB {slug}")
            h = fetch(url)
            if not h:
                log(f" ✗ klub ne postoji ili 404")
                continue
            parsed = parse_klub(h, slug)
            if not parsed:
                log(f" ✗ ne mogu parse")
                continue
            try:
                kid = upsert_klub(conn, parsed)
            except Exception as e:
                # One failing club must not abort the remaining clubs.
                log(f" ✗ klub {slug}: {e}")
                continue
            log(f"{parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")
            success += 1

            # Team leaders
            for v in parsed['voditelji']:
                try:
                    upsert_voditelj(conn, v, kid, parsed['naziv'])
                    log(f" ✓ voditelj: {v}")
                except Exception as e:
                    log(f" ✗ voditelj {v}: {e}")

            # Players - fetch the profile page of each one
            for ig in parsed['igraci']:
                time.sleep(DELAY)
                try:
                    purl = f"{BASE}/igraci/{ig['slug']}/"
                    ph = fetch(purl)
                    if not ph:
                        continue
                    pdata = parse_igrac(ph, ig['slug'])
                    if not pdata:
                        continue
                    # Fall back to the name from the club list when the profile
                    # parser picked up the federation banner instead.
                    if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
                        pdata['full_name'] = ig['ime_prezime']
                        parts = ig['ime_prezime'].split()
                        pdata['ime'] = parts[0] if parts else ''
                        pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
                    # Card number and birth year from the club list when the
                    # profile page did not provide them.
                    pinfo = pdata.setdefault('info', {})
                    if not pinfo.get('iskaznica'):
                        pinfo['iskaznica'] = ig.get('iskaznica')
                    if not pinfo.get('godina_rodenja'):
                        pinfo['godina_rodenja'] = ig.get('godina_rodenja')
                    cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
                    players_total += 1
                    log(f"{pdata['ime']} {pdata['prezime']} (db={cid}, god={pinfo.get('godina_rodenja')})")
                except Exception as e:
                    log(f" ✗ igrač {ig['slug']}: {e}")
            time.sleep(DELAY)
        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        # Release the connection even when the run is interrupted.
        conn.close()


if __name__ == "__main__":
    main()
+337
View File
@@ -0,0 +1,337 @@
#!/usr/bin/env python3
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
import os, re, sys, time, json, html, traceback, datetime as dt
import urllib.request, urllib.error
from urllib.parse import urljoin
import psycopg2
# PostgreSQL connection parameters passed verbatim to psycopg2.connect().
# NOTE(review): the password is hard-coded in source; consider moving it to the
# environment or a secrets store.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
# Root URL of the federation site; club and player URLs are built from it.
BASE = "https://hrvatski-bocarski-savez.hr"
# User-Agent header sent with every HTTP request.
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
# Base pause in seconds between HTTP requests (doubled between retries).
DELAY = 1.2
# File that log() appends to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
def log(msg):
    """Print a timestamped message and append it to the log file at LOG_FP.

    Writing to the file is best-effort: if the file cannot be opened or
    written (missing directory, permissions, ...), the message is still
    printed and the I/O error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain Croatian letters and symbols that a
        # non-UTF-8 locale default could not encode.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # Only swallow file-system errors; a bare except would also hide
        # KeyboardInterrupt and programming errors.
        pass
def db():
    """Open a psycopg2 connection using DB and switch it to autocommit."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
def fetch(url, retries=2):
    """Download *url* and return the body decoded as UTF-8, or None.

    Up to ``retries + 1`` attempts are made. HTTP 404/410 return None
    immediately; any other failure is retried after ``DELAY * 2`` seconds and
    logged only when the final attempt fails.
    """
    request_headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    for attempt in range(retries + 1):
        is_last_attempt = attempt == retries
        try:
            request = urllib.request.Request(url, headers=request_headers)
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as http_err:
            # Missing/gone pages are a definitive answer - do not retry.
            if http_err.code in (404, 410):
                return None
            if is_last_attempt:
                log(f"HTTP {http_err.code} {url}")
                return None
        except Exception as err:
            if is_last_attempt:
                log(f"FETCH err {err} {url}")
                return None
        # Back off before the next attempt.
        time.sleep(DELAY * 2)
    return None
# === KLUB PARSER ===
def parse_klub(h, slug):
    """Extract club data from the HTML of a club page.

    Returns None when *h* is empty or no usable club name is found; otherwise
    a dict with keys ``slug``, ``naziv`` (name), ``logo`` (absolute URL or
    None), ``info`` (labelled fields), ``igraci`` (players listed on the page)
    and ``voditelji`` (team leaders, at most 10).
    """
    if not h:
        return None

    # Club name: first <h3> that is not the federation banner or a sponsor heading.
    naziv = None
    for raw_title in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        title = html.unescape(raw_title.strip())
        if not title or len(title) >= 80:
            continue
        if 'Fédération' in title or 'sponzor' in title.lower():
            continue
        naziv = title
        break
    if not naziv:
        return None

    # Logo
    logo_match = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, logo_match.group(1)) if logo_match else None

    # Labelled "<strong>Label:</strong> value" fields. The rules are checked in
    # order and the first label substring that matches decides the target key.
    field_rules = (
        (('županija',), 'zupanija'),
        (('liga',), 'liga'),
        (('adresa',), 'adresa'),
        (('sportske grane', 'sportska grana'), 'sportske_grane'),
        (('osoba za kontakt',), 'kontakt_osoba'),
        (('tel',), 'telefon'),
        (('oib',), 'oib'),
    )
    info = {}
    for field in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        label = field.group(1).strip().lower()
        value = html.unescape(re.sub(r'<[^>]+>', '', field.group(2).strip()))
        for needles, target in field_rules:
            if any(needle in label for needle in needles):
                info[target] = value
                break

    # Players - pattern: <li><a href="...igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    igraci = [
        {
            "slug": pm.group(1),
            "iskaznica": pm.group(2).strip(),
            "ime_prezime": html.unescape(pm.group(3).strip()),
            "godina_rodenja": int(pm.group(4)),
        }
        for pm in re.finditer(
            r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?',
            h
        )
    ]

    # Team leaders (coaches) - tab #popis_voditelja_ekipe
    voditelji = []
    vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if vsec:
        section = vsec.group(1)
        for pm in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section):
            name = re.sub(r'\s+', ' ', pm.group(1).strip())
            if len(name) <= 4 or len(name.split()) < 2:
                continue
            if 'Trenutno' in name or name in voditelji:
                continue
            voditelji.append(name)
        # Fallback when the names are not wrapped in <p> tags: treat every
        # text fragment between tags as a candidate line.
        if not voditelji:
            for fragment in re.sub(r'<[^>]+>', '\n', section).split('\n'):
                fragment = fragment.strip()
                words = fragment.split()
                if len(fragment) <= 4 or len(words) < 2 or 'Trenutno' in fragment:
                    continue
                if all(w[0].isupper() for w in words[:2] if w):
                    voditelji.append(fragment)

    return {
        "slug": slug, "naziv": naziv, "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10]
    }
# === IGRAČ PARSER ===
def parse_igrac(h, slug):
    """Extract player data from the HTML of a player profile page.

    Returns None for empty *h*; otherwise a dict with ``slug``, ``ime`` (first
    word of the name), ``prezime`` (remaining words), ``full_name``,
    ``slika_url`` (absolute URL or None), ``info`` (card number, birth year,
    home club when present) and ``karijera`` (rows of the "Sportski put" table).
    """
    if not h:
        return None

    # Name: first multi-word <h3> that is neither the federation banner nor a
    # heading containing "Sport"; fall back to a title-cased slug.
    titles = (html.unescape(t.strip()) for t in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h))
    full_name = next(
        (t for t in titles
         if t and 'Fédération' not in t and 'Sport' not in t and len(t) < 80 and len(t.split()) >= 2),
        None,
    )
    if not full_name:
        full_name = slug.replace("-", " ").title()
    name_words = full_name.split()
    ime = name_words[0] if name_words else ""
    prezime = " ".join(name_words[1:])

    # Photo
    photo = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, photo.group(1)) if photo else None

    info = {}
    for field in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        label = field.group(1).strip().lower()
        value = html.unescape(re.sub(r'<[^>]+>', '', field.group(2).strip()).rstrip('.'))
        if 'iskaznic' in label:
            info['iskaznica'] = value
        elif 'godina rođenja' in label:
            # Values without a 4-digit year are ignored.
            year = re.search(r'(\d{4})', value)
            if year:
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in label:
            info['maticni_klub'] = value

    def _cell_text(cell):
        # Drop markup and surrounding whitespace from one table cell.
        return re.sub(r'<[^>]+>', '', cell).strip()

    # Career ("Sportski put") - registration table
    karijera = []
    table_m = (re.search(r'### Sportski put.*?</table>', h, re.S)
               or re.search(r'Sportski put.*?</table>', h, re.S))
    if table_m:
        table_rows = re.findall(r'<tr>(.*?)</tr>', table_m.group(0), re.S)
        for tr in table_rows[1:]:  # skip header
            cells = re.findall(r'<td[^>]*>(.*?)</td>', tr, re.S)
            if len(cells) < 4:
                continue
            karijera.append({
                "datum_reg": _cell_text(cells[0]).rstrip('.'),
                "klub": _cell_text(cells[1]),
                "sportska_grana": _cell_text(cells[2]),
                "sezona": _cell_text(cells[3]),
                "lijecnicki": _cell_text(cells[4]).rstrip('.') if len(cells) > 4 else None
            })

    return {
        "slug": slug, "ime": ime, "prezime": prezime, "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera
    }
# PGŽ clubs - the real slugs from bocarski-savez-pgz.
# Each entry is the <slug> part of the club page URL {BASE}/klubovi/<slug>/ that main() fetches.
PGZ_HBS_CLUBS = [
# Senior clubs
"kastav", "kostrena", "krenovac", "krimeja", "krk", "lovran", "opatija",
"rijeka-2", "srdoci", "sveti-jakov", "sveti-rok-klana", "vargon", "hreljin",
"draga-moscenicka-draga", "lovranska-draga", "brod-moravice",
# Women's clubs
"zenski-bocarski-klub-cavle", "zenski-bocarski-klub-drenova-rijeka",
"zenski-bocarski-klub-hreljin", "zenski-bocarski-klub-kastav",
"zenska-bocarska-ekipa-kastav-2",
# Cadet / junior teams (youth categories)
"cavle-skola-bocanja", "juniorska-ekipa-cavle-sb-1", "juniorska-ekipa-kastav",
"juniorska-ekipa-lovran", "juniorska-ekipa-sv-rok-klana", "juniorska-ekipa-vargon",
"kadetska-ekipa-bk-cavle-sb-2", "kadetska-ekipa-bk-kastav-2",
"kadetska-ekipa-bk-lovran", "kadetska-ekipa-bk-sveti-jakov-2",
"kadetska-ekipa-bk-vargon", "kadetska-ekipa-kastav", "kadetska-ekipa-zbk-drenova",
]
def upsert_klub(conn, k):
    """Insert or update one club in pgz_sport.klubovi and return its id.

    Args:
        conn: open psycopg2 connection.
        k: club dict as produced by parse_klub (uses ``slug``, ``naziv``, ``info``).

    An existing row is located first by its ``hbs:<slug>`` marker stored in
    ``napomena``, then by sport + case-insensitive name. Existing rows only get
    address / phone / town filled in when the page provides them.
    """
    info = k.get('info', {})
    naziv = k['naziv']
    slug = k['slug']

    # Town heuristic: the last one or two words of the address.
    grad = None
    if info.get('adresa'):
        m = re.search(r'(\w+(?:\s+\w+)?)$', info['adresa'].strip())
        if m:
            grad = m.group(1)

    with conn.cursor() as cur:
        # The marker is always written as "hbs:<slug>]" (update) or
        # "hbs:<slug>," (insert). Matching the terminator as well stops a slug
        # from matching rows whose slug merely starts with it
        # (e.g. "lovran" vs "lovranska-draga").
        cur.execute(
            """SELECT id FROM pgz_sport.klubovi
               WHERE napomena ILIKE %s OR napomena ILIKE %s
               ORDER BY id LIMIT 1""",
            (f"%hbs:{slug}]%", f"%hbs:{slug},%"))
        row = cur.fetchone()
        has_marker = row is not None
        if not row:
            cur.execute(
                "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
                (naziv,))
            row = cur.fetchone()

        if row:
            kid = row[0]
            cur.execute("""UPDATE pgz_sport.klubovi SET
                    adresa=COALESCE(%s, adresa),
                    telefon=COALESCE(%s, telefon),
                    grad=COALESCE(%s, grad)
                WHERE id=%s""",
                (info.get('adresa'), info.get('telefon'), grad, kid))
            if not has_marker:
                # Tag the row once so later runs find it by slug; appending on
                # every run made napomena grow without bound.
                cur.execute("""UPDATE pgz_sport.klubovi SET
                        napomena=COALESCE(napomena,'') || ' [HBS sync ' || CURRENT_DATE || ': hbs:' || %s || ']'
                    WHERE id=%s""",
                    (slug, kid))
        else:
            cur.execute("""INSERT INTO pgz_sport.klubovi
                    (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
                VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
                RETURNING id""",
                (naziv, grad, info.get('adresa'), info.get('telefon'),
                 f"[HBS sync {dt.date.today()}: hbs:{slug}, OIB:{info.get('oib','-')}, liga:{info.get('liga','-')}]"))
            kid = cur.fetchone()[0]
    return kid
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update one player in pgz_sport.clanovi and return the row id.

    An existing row is looked up by the HBS card number (``source_id`` with
    ``source='hbs_savez'``); when there is no card number or no such row, a
    new row is inserted.
    """
    cur = conn.cursor()
    info = p.get('info', {})
    iskaznica = (info.get('iskaznica') or '').strip()

    # Look up by card number (the HBS unique ID).
    cid = None
    if iskaznica:
        cur.execute("SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s", (iskaznica,))
        found = cur.fetchone()
        if found:
            cid = found[0]

    # Values shared, in this order, by the UPDATE and the INSERT.
    values = (
        p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
        info.get('godina_rodenja'), p['slug'], iskaznica, f"{BASE}/igraci/{p['slug']}/",
    )

    if cid:
        cur.execute("""UPDATE pgz_sport.clanovi SET
                ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                klub_id=%s, klub_naziv_godisnjak=%s,
                slika_url=COALESCE(%s, slika_url),
                godina_rodenja=COALESCE(%s, godina_rodenja),
                slug=%s,
                source='hbs_savez', source_id=%s, source_url=%s, source_synced_at=now()
            WHERE id=%s""",
            values + (cid,))
        return cid

    cur.execute("""INSERT INTO pgz_sport.clanovi
            (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
            godina_rodenja, slug, source, source_id, source_url, source_synced_at)
        VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
        RETURNING id""",
        values)
    return cur.fetchone()[0]
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Insert or update a team leader (treated as a coach) in pgz_sport.clanovi.

    *name* is split into first word (``ime``) and the rest (``prezime``);
    single-word names are rejected with None. A row with the same name and
    sport 'boćanje' gets its role set to *role* and its club / source URL
    filled in only where empty; otherwise a new row is inserted.
    Returns the row id.
    """
    words = name.strip().split()
    if len(words) < 2:
        return None
    ime = words[0]
    prezime = " ".join(words[1:])
    listing_url = f"{BASE}/klubovi/"

    cur = conn.cursor()
    cur.execute("""SELECT id FROM pgz_sport.clanovi
        WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'""",
        (ime, prezime))
    existing = cur.fetchone()

    if not existing:
        cur.execute("""INSERT INTO pgz_sport.clanovi
                (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
                source, source_url, source_synced_at)
            VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
            RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, listing_url))
        return cur.fetchone()[0]

    cur.execute("""UPDATE pgz_sport.clanovi SET
            uloga=%s, klub_id=COALESCE(klub_id, %s),
            klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
            source_url=COALESCE(source_url, %s)
        WHERE id=%s""",
        (role, klub_db_id, klub_naziv, listing_url, existing[0]))
    return existing[0]
def main():
    """Scrape every club in PGZ_HBS_CLUBS and sync clubs, leaders and players.

    For each slug the club page is fetched and parsed, the club is upserted,
    then its team leaders, then every listed player (one profile request per
    player, paced by DELAY). Failures are logged and the run continues with
    the next item; the DB connection is always closed at the end.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")
        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            url = f"{BASE}/klubovi/{slug}/"
            log(f"→ KLUB {slug}")
            h = fetch(url)
            if not h:
                log(f" ✗ klub ne postoji ili 404")
                continue
            parsed = parse_klub(h, slug)
            if not parsed:
                log(f" ✗ ne mogu parse")
                continue
            try:
                kid = upsert_klub(conn, parsed)
            except Exception as e:
                # One failing club must not abort the remaining clubs.
                log(f" ✗ klub {slug}: {e}")
                continue
            log(f" ✓ {parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")
            success += 1

            # Team leaders
            for v in parsed['voditelji']:
                try:
                    upsert_voditelj(conn, v, kid, parsed['naziv'])
                    log(f" ✓ voditelj: {v}")
                except Exception as e:
                    log(f" ✗ voditelj {v}: {e}")

            # Players - fetch the profile page of each one
            for ig in parsed['igraci']:
                time.sleep(DELAY)
                try:
                    purl = f"{BASE}/igraci/{ig['slug']}/"
                    ph = fetch(purl)
                    if not ph:
                        continue
                    pdata = parse_igrac(ph, ig['slug'])
                    if not pdata:
                        continue
                    # Fall back to the name from the club list when the profile
                    # parser picked up the federation banner instead.
                    if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
                        pdata['full_name'] = ig['ime_prezime']
                        parts = ig['ime_prezime'].split()
                        pdata['ime'] = parts[0] if parts else ''
                        pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
                    # Card number and birth year from the club list when the
                    # profile page did not provide them.
                    pinfo = pdata.setdefault('info', {})
                    if not pinfo.get('iskaznica'):
                        pinfo['iskaznica'] = ig.get('iskaznica')
                    if not pinfo.get('godina_rodenja'):
                        pinfo['godina_rodenja'] = ig.get('godina_rodenja')
                    cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
                    players_total += 1
                    log(f" ✓ {pdata['ime']} {pdata['prezime']} (db={cid}, god={pinfo.get('godina_rodenja')})")
                except Exception as e:
                    log(f" ✗ igrač {ig['slug']}: {e}")
            time.sleep(DELAY)
        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        # Release the connection even when the run is interrupted.
        conn.close()


if __name__ == "__main__":
    main()
+173
View File
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
HKS-CBF scraper - parses Genius Sports embed standings for HR košarka leagues.
Path: /opt/pgz-sport/scrapers/hks_scraper.py
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
"""
import requests, re, json, psycopg2, html as ihtml
# PostgreSQL connection parameters passed verbatim to psycopg2.connect().
# The original literal listed `port` twice (6432 and 5432), which Python rejects
# with "SyntaxError: keyword argument repeated", so the script could not start.
# NOTE(review): 6432 is kept because it is the port paired with host 10.10.0.2
# elsewhere in this commit - confirm it is the intended endpoint.
# NOTE(review): the password is hard-coded in source; consider moving it to the
# environment or a secrets store.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Headers sent with every standings request.
HDR = {"User-Agent": UA, "Accept": "application/json", "Referer": "https://www.hks-cbf.hr/"}
# Competition ID lookup - 2025/26 season.
# natj = competition name stored in the DB, razina = level label,
# comp_id = id used in the Genius Sports standings URL, spol = M/Ž,
# url = public league page stored as source_url.
COMPS = [
    {"natj": "Supersport Premijer Liga (M) 2025/26", "razina": "Premijer M", "comp_id": 42186, "spol": "M",
     "url": "https://www.hks-cbf.hr/supersport-premijer/"},
    {"natj": "Supersport Premijer Liga (Ž) 2025/26", "razina": "Premijer Ž", "comp_id": 42187, "spol": "Ž",
     "url": "https://www.hks-cbf.hr/premijer-zenska-liga/"},
    {"natj": "1.Muška liga 2025/26", "razina": "1.M liga", "comp_id": 42259, "spol": "M",
     "url": "https://www.hks-cbf.hr/prva-muska-liga/"},
]
def parse_standings(html):
    """Turn the first HTML <table> of a standings embed into a list of dicts.

    The first table row is treated as the header and skipped. Rows with fewer
    than 8 cells, a non-integer position, or any missing / non-integer numeric
    column are dropped. Each result has the keys ``poz``, ``klub``, ``abbrev``,
    ``gp``, ``bod``, ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and
    ``ner`` (always 0).
    """
    table_bodies = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if not table_bodies:
        return []
    tr_chunks = re.findall(r'<tr[^>]*>(.+?)</tr>', table_bodies[0], re.DOTALL)
    if len(tr_chunks) < 2:
        return []

    def _int_no_commas(text):
        # Points columns may carry thousands separators.
        return int(text.replace(',', ''))

    result = []
    for tr in tr_chunks[1:]:  # skip header
        raw_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', tr, re.DOTALL)
        if len(raw_cells) < 8:
            continue
        cols = [ihtml.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in raw_cells]
        # e.g. ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
        try:
            position = int(cols[0])
        except (ValueError, IndexError):
            continue

        # The team cell has name + abbreviation glued together ("KK ZadarZAD");
        # a trailing run of 2-4 uppercase letters is assumed to be the abbreviation.
        team_raw = cols[2]
        glued = re.match(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$', team_raw)
        if glued:
            klub, abbrev = glued.group(1).strip(), glued.group(2)
        else:
            klub, abbrev = team_raw, None

        try:
            entry = {
                "poz": position,
                "klub": klub,
                "abbrev": abbrev,
                "gp": int(cols[3]),
                "bod": int(cols[4]),
                "pob": int(cols[5]),
                "por": int(cols[6]),
                "for_pts": _int_no_commas(cols[7]),
                "ag_pts": _int_no_commas(cols[8]),
                "gd": _int_no_commas(cols[9].replace('+', '')),
                "ner": 0,  # basketball has no draws
            }
        except (ValueError, IndexError):
            continue
        result.append(entry)
    return result
def main():
    """Scrape the standings of every competition in COMPS into the database.

    For each competition: fetch the Genius Sports embed JSON, parse the
    standings table, get or create the pgz_sport.natjecanja row, replace its
    'hks_genius' rows in pgz_sport.natjecanja_tablice, and try to match each
    team to pgz_sport.klubovi. Afterwards print summary statistics and write
    one pgz_sport.audit_feed record.

    A competition whose fetch fails or whose page yields no rows is skipped
    without touching its stored standings.
    """
    # Substrings of klubovi.grad that count a club as PGŽ when region is not 'PGŽ'.
    pgz_town_hints = ['rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab', 'lošinj', 'losinj',
                      'vrbnik', 'novi vinodolski', 'čavle', 'cavle', 'kraljevica', 'kostrena', 'kastav']
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_klubovi_seen = set()
        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                # Treat HTTP errors as fetch failures instead of parsing an error body.
                resp.raise_for_status()
                page = resp.json().get('html') or ''
            except Exception as e:
                print(f" ERR fetch: {e}")
                continue
            rows = parse_standings(page)
            print(f" Parsed {len(rows)} klubova")

            # Get/create the competition row
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja
                        (naziv, sport, razina, sezona, source, source_url)
                    VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                    RETURNING id""",
                    (comp['natj'], comp['razina'], comp['url']))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            if not rows:
                # Nothing parsed (empty or changed page): deleting now would
                # wipe the stored standings and insert nothing in their place.
                print(" SKIP: no rows parsed, keeping existing standings")
                continue

            # Clear old rows
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))
            for row in rows:
                # Match the team to a known club: exact name or substring,
                # preferring active, basketball, PGŽ clubs, then lowest id.
                klub_id = None
                cu.execute("""SELECT id FROM pgz_sport.klubovi
                    WHERE LOWER(naziv) = LOWER(%s)
                    OR LOWER(naziv) LIKE LOWER(%s)
                    ORDER BY
                    CASE WHEN aktivan THEN 0 ELSE 1 END,
                    CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                    CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                    id ASC LIMIT 1""",
                    (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id = kr[0]
                    # Check whether the matched club is from PGŽ
                    cu.execute("SELECT region, grad FROM pgz_sport.klubovi WHERE id=%s", (klub_id,))
                    kdata = cu.fetchone()
                    if kdata and (kdata[0] == 'PGŽ' or (kdata[1] and any(g in (kdata[1] or '').lower() for g in pgz_town_hints))):
                        pgz_klubovi_seen.add(row['klub'])
                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija,
                        odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                        gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                        'hks_genius', %s, now(), %s::jsonb)""",
                    (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                     row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                     comp['url'],
                     json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        # Stats per league
        cu.execute("""SELECT n.naziv, count(t.*),
                count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
            FROM pgz_sport.natjecanja n
            LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
            WHERE n.source='hks_genius'
            GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HKS lige stats ===")
        for stat in cu.fetchall():
            print(f" {stat[1]:>3} klubova ({stat[2]} matched) {stat[0]}")

        # PGŽ clubs found in the HKS standings
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
            FROM pgz_sport.natjecanja_tablice t
            JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
            LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
            WHERE t.source='hks_genius' AND k.region='PGŽ'
            ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HKS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} -> {rec[5]} '{rec[6]}'")

        cu.execute("""INSERT INTO pgz_sport.audit_feed
                (table_name, action, source, source_url, details)
            VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "comps": len(COMPS), "pgz_seen": list(pgz_klubovi_seen)}),))
    finally:
        # Release the connection even when a query fails midway.
        conn.close()


if __name__ == "__main__":
    main()
+173
View File
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
HKS-CBF scraper - parses Genius Sports embed standings for HR košarka leagues.
Path: /opt/pgz-sport/scrapers/hks_scraper.py
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
"""
import requests, re, json, psycopg2, html as ihtml
# PostgreSQL connection parameters passed verbatim to psycopg2.connect().
# NOTE(review): the password is hard-coded in source; consider moving it to the
# environment or a secrets store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Headers sent with every standings request.
HDR = {"User-Agent": UA, "Accept": "application/json", "Referer": "https://www.hks-cbf.hr/"}
# Competition ID lookup - 2025/26 season.
# natj = competition name stored in the DB, razina = level label,
# comp_id = id used in the Genius Sports standings URL, spol = M/Ž,
# url = public league page stored as source_url.
COMPS = [
{"natj": "Supersport Premijer Liga (M) 2025/26", "razina": "Premijer M", "comp_id": 42186, "spol": "M",
"url": "https://www.hks-cbf.hr/supersport-premijer/"},
{"natj": "Supersport Premijer Liga (Ž) 2025/26", "razina": "Premijer Ž", "comp_id": 42187, "spol": "Ž",
"url": "https://www.hks-cbf.hr/premijer-zenska-liga/"},
{"natj": "1.Muška liga 2025/26", "razina": "1.M liga", "comp_id": 42259, "spol": "M",
"url": "https://www.hks-cbf.hr/prva-muska-liga/"},
]
def parse_standings(html):
    """Turn the first HTML <table> of a standings embed into a list of dicts.

    The first table row is treated as the header and skipped. Rows with fewer
    than 8 cells, a non-integer position, or any missing / non-integer numeric
    column are dropped. Each result has the keys ``poz``, ``klub``, ``abbrev``,
    ``gp``, ``bod``, ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and
    ``ner`` (always 0).
    """
    table_bodies = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if not table_bodies:
        return []
    tr_chunks = re.findall(r'<tr[^>]*>(.+?)</tr>', table_bodies[0], re.DOTALL)
    if len(tr_chunks) < 2:
        return []

    def _int_no_commas(text):
        # Points columns may carry thousands separators.
        return int(text.replace(',', ''))

    result = []
    for tr in tr_chunks[1:]:  # skip header
        raw_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', tr, re.DOTALL)
        if len(raw_cells) < 8:
            continue
        cols = [ihtml.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in raw_cells]
        # e.g. ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
        try:
            position = int(cols[0])
        except (ValueError, IndexError):
            continue

        # The team cell has name + abbreviation glued together ("KK ZadarZAD");
        # a trailing run of 2-4 uppercase letters is assumed to be the abbreviation.
        team_raw = cols[2]
        glued = re.match(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$', team_raw)
        if glued:
            klub, abbrev = glued.group(1).strip(), glued.group(2)
        else:
            klub, abbrev = team_raw, None

        try:
            entry = {
                "poz": position,
                "klub": klub,
                "abbrev": abbrev,
                "gp": int(cols[3]),
                "bod": int(cols[4]),
                "pob": int(cols[5]),
                "por": int(cols[6]),
                "for_pts": _int_no_commas(cols[7]),
                "ag_pts": _int_no_commas(cols[8]),
                "gd": _int_no_commas(cols[9].replace('+', '')),
                "ner": 0,  # basketball has no draws
            }
        except (ValueError, IndexError):
            continue
        result.append(entry)
    return result
def main():
    """Scrape HKS basketball standings into pgz_sport.natjecanja_tablice.

    For every entry in COMPS the Genius Sports standings embed is fetched and
    parsed with parse_standings(). The matching natjecanja row is looked up by
    name (or created), the rows previously stored for it with
    source='hks_genius' are deleted and the fresh rows are inserted. Each club
    name is matched against pgz_sport.klubovi on a best-effort basis. Finally
    summary statistics are printed and one audit_feed row is written.

    A competition whose fetch fails, or whose table yields no rows, is skipped
    without touching the standings already stored for it, so a site outage or
    layout change cannot wipe existing data.
    """
    # Town-name fragments used to recognise PGŽ clubs when klubovi.region is not set.
    pgz_towns = ['rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab',
                 'lošinj', 'losinj', 'vrbnik', 'novi vinodolski', 'čavle', 'cavle',
                 'kraljevica', 'kostrena', 'kastav']
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_klubovi_seen = set()
        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                resp.raise_for_status()
                html = resp.json().get('html') or ''
            except Exception as e:
                print(f" ERR fetch: {e}")
                continue
            rows = parse_standings(html)
            print(f" Parsed {len(rows)} klubova")
            if not rows:
                # Nothing usable was parsed: keep what is stored instead of
                # deleting it and inserting nothing.
                print(" No rows parsed - keeping previously stored standings")
                continue
            # Get/create natjecanje
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja
                              (naziv, sport, razina, sezona, source, source_url)
                              VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                              RETURNING id""",
                           (comp['natj'], comp['razina'], comp['url']))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")
            # Clear old rows
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))
            for row in rows:
                # Match klub: exact name first, then substring; prefer active,
                # basketball, PGŽ clubs.
                klub_id = None
                cu.execute("""SELECT id FROM pgz_sport.klubovi
                              WHERE LOWER(naziv) = LOWER(%s)
                                 OR LOWER(naziv) LIKE LOWER(%s)
                              ORDER BY
                                CASE WHEN aktivan THEN 0 ELSE 1 END,
                                CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                                CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                                id ASC LIMIT 1""",
                           (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id = kr[0]
                    # Check if PGŽ (by region, or by town-name fragment)
                    cu.execute("SELECT region, grad FROM pgz_sport.klubovi WHERE id=%s", (klub_id,))
                    kdata = cu.fetchone()
                    if kdata:
                        grad = (kdata[1] or '').lower()
                        if kdata[0] == 'PGŽ' or (grad and any(g in grad for g in pgz_towns)):
                            pgz_klubovi_seen.add(row['klub'])
                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                              (natjecanje_id, klub_id, klub_naziv, pozicija,
                               odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                               gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                              VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                                      'hks_genius', %s, now(), %s::jsonb)""",
                           (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                            row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                            comp['url'],
                            json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                total_inserted += 1
        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")
        # Stats
        cu.execute("""SELECT n.naziv, count(t.*),
                             count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
                      FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
                      WHERE n.source='hks_genius'
                      GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HKS lige stats ===")
        for stat in cu.fetchall():
            print(f" {stat[1]:>3} klubova ({stat[2]} matched) {stat[0]}")
        # PGŽ clubs
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hks_genius' AND k.region='PGŽ'
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HKS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} -> {rec[5]} '{rec[6]}'")
        cu.execute("""INSERT INTO pgz_sport.audit_feed
                      (table_name, action, source, source_url, details)
                      VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
                   (json.dumps({"inserted": total_inserted, "comps": len(COMPS), "pgz_seen": list(pgz_klubovi_seen)}),))
    finally:
        # Release the connection even when a query above raises.
        conn.close()


if __name__ == "__main__":
    main()
+102
View File
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
import requests, re, html as h_unesc, psycopg2
# Connection parameters passed to psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store and rotating this value.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header value sent with every request.
UA = "RiNET-Civic/1.0"
# Shared HTTP session carrying the User-Agent header; used by fetch().
s = requests.Session()
s.headers.update({"User-Agent": UA})
def fetch(url):
    """GET *url* through the shared session and return the response text.

    Returns:
        The body as text when the server answers 200, otherwise None. None is
        also returned for transport-level failures (connection error,
        timeout, invalid URL, ...).
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Only network/HTTP-layer errors are treated as "no data"; a bare
        # except here would also swallow KeyboardInterrupt and SystemExit.
        return None
    return r.text if r.status_code == 200 else None
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose header mentions *header_marker*.

    Tables are examined in document order. The first ``<tr>`` of a table is
    taken as its header; each header cell is stripped of tags and surrounding
    whitespace, and the table qualifies as soon as one cell contains
    *header_marker*.

    Returns:
        The inner HTML of every ``<tr>`` of the matching table (header row
        included), or None when no table qualifies.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    for table_html in re.findall(r'<table[^>]*>.*?</table>', html_text, re.DOTALL):
        table_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL)
        if not table_rows:
            continue
        header_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', table_rows[0], re.DOTALL)
        for cell in header_cells:
            if header_marker in tag_pattern.sub('', cell).strip():
                return table_rows
    return None
def parse_standings_rows(rows):
    """Parse standings rows (inner HTML of ``<tr>`` elements) into dicts.

    The first element of *rows* is assumed to be the header and is skipped.
    Every other row must provide at least ten cells in the order: position,
    club, played, wins, draws, losses, goals for, goals against, goal
    difference, points.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``odigrano``,
        ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and
        ``bod``. Rows that are too short or hold non-numeric values are
        skipped.
    """
    out = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        cleaned = [re.sub(r'<[^>]+>', '', h_unesc.unescape(c)).strip() for c in cells]
        if len(cleaned) < 10:  # indexes 0..9 are read below
            continue
        try:
            # Goal difference may carry an explicit "+" or the typographic
            # minus sign (U+2212, also what "&minus;" unescapes to); normalise
            # both. Replacing the empty string instead would put a hyphen
            # between every character and make int() fail for every row.
            razl = int(cleaned[8].replace('+', '').replace('\u2212', '-'))
            out.append({
                "poz": int(cleaned[0].rstrip('.')),
                "klub": cleaned[1],
                "odigrano": int(cleaned[2]),
                "pobjede": int(cleaned[3]),
                "nerij": int(cleaned[4]),
                "porazi": int(cleaned[5]),
                "gz": int(cleaned[6]),
                "gp": int(cleaned[7]),
                "razl": razl,
                "bod": int(cleaned[9]),
            })
        except ValueError:
            continue
    return out
# === Sources ===
# Each tuple is (url, naziv, razina, ext_id, sezona); ext_id is matched against
# natjecanja.external_id by the scrape loop below.
LIGE = [
("https://hnl.hr/", "SuperSport HNL", "1.HNL", "supersport_hnl_2025_2026", "2025/2026"),
]
# Module-level connection in autocommit mode: every statement is committed immediately.
conn = psycopg2.connect(**DB); conn.autocommit = True
# Shared cursor used by find_klub() and the scrape loop.
cr = conn.cursor()
def find_klub(naziv):
    """Look up the klubovi id that best matches a scraped club name.

    Football clubs are tried first with the patterns ``HNK <naziv>``,
    ``NK <naziv>``, ``<naziv>`` and ``%<naziv>%`` (in that order); if none of
    them hits, a substring match across all sports is attempted. Club id 4426
    is always excluded.

    Returns:
        The id of the first matching row, or None when nothing matches.
    """
    wildcard = f"%{naziv}%"
    football_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1"
    any_sport_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"
    # HNK / NK prefixes first, then the bare name, then a substring match
    for pattern in (f"HNK {naziv}", f"NK {naziv}", naziv, wildcard):
        cr.execute(football_sql, (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    cr.execute(any_sport_sql, (wildcard,))
    hit = cr.fetchone()
    if not hit:
        return None
    return hit[0]
# Scrape every configured league and replace its stored standings.
for url, naziv, razina, ext_id, sezona in LIGE:
    print(f"=== {naziv} ===")
    body = fetch(url)
    if not body:
        print(" fetch failed")
        continue
    rows = find_table_with_header(body, "Klub")
    if not rows:
        print(" no table found")
        continue
    parsed = parse_standings_rows(rows)
    print(f" {len(parsed)} rows parsed")
    if not parsed:
        # Nothing usable was parsed: do not delete the stored table for nothing.
        print(" nothing parsed - keeping existing rows")
        continue
    cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
    found = cr.fetchone()
    if not found:
        # The competition row is expected to exist already; skip instead of
        # crashing on fetchone()[0].
        print(f" natjecanje with external_id={ext_id} not found - skipped")
        continue
    nid = found[0]
    cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))
    matched = 0
    for r in parsed:
        kid = find_klub(r["klub"])
        if kid:
            matched += 1
        cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                      (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                       porazi, gol_z, gol_p, gol_razlika, bodovi)
                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                   (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                    r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
    print(f" matched klub_id: {matched}/{len(parsed)}")
# Verify Rijeka: print the stored rows whose club name contains "Rijeka".
cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
              WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
for r in cr.fetchall():
    print(f" {r}")
conn.close()
+102
View File
@@ -0,0 +1,102 @@
#!/usr/bin/env python3
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
import requests, re, html as h_unesc, psycopg2
# Connection parameters passed to psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store and rotating this value.
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header value sent with every request.
UA = "RiNET-Civic/1.0"
# Shared HTTP session carrying the User-Agent header; used by fetch().
s = requests.Session()
s.headers.update({"User-Agent": UA})
def fetch(url):
    """GET *url* through the shared session and return the response text.

    Returns:
        The body as text when the server answers 200, otherwise None. None is
        also returned for transport-level failures (connection error,
        timeout, invalid URL, ...).
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        # Only network/HTTP-layer errors are treated as "no data"; a bare
        # except here would also swallow KeyboardInterrupt and SystemExit.
        return None
    return r.text if r.status_code == 200 else None
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose header mentions *header_marker*.

    Tables are examined in document order. The first ``<tr>`` of a table is
    taken as its header; each header cell is stripped of tags and surrounding
    whitespace, and the table qualifies as soon as one cell contains
    *header_marker*.

    Returns:
        The inner HTML of every ``<tr>`` of the matching table (header row
        included), or None when no table qualifies.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    for table_html in re.findall(r'<table[^>]*>.*?</table>', html_text, re.DOTALL):
        table_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, re.DOTALL)
        if not table_rows:
            continue
        header_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', table_rows[0], re.DOTALL)
        for cell in header_cells:
            if header_marker in tag_pattern.sub('', cell).strip():
                return table_rows
    return None
def parse_standings_rows(rows):
    """Parse standings rows (inner HTML of ``<tr>`` elements) into dicts.

    The first element of *rows* is assumed to be the header and is skipped.
    Every other row must provide at least ten cells in the order: position,
    club, played, wins, draws, losses, goals for, goals against, goal
    difference, points.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``odigrano``,
        ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and
        ``bod``. Rows that are too short or hold non-numeric values are
        skipped.
    """
    out = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        cleaned = [re.sub(r'<[^>]+>', '', h_unesc.unescape(c)).strip() for c in cells]
        if len(cleaned) < 10:  # indexes 0..9 are read below
            continue
        try:
            # Goal difference may carry an explicit "+" or the typographic
            # minus sign (U+2212, also what "&minus;" unescapes to); normalise
            # both. Replacing the empty string instead would put a hyphen
            # between every character and make int() fail for every row.
            razl = int(cleaned[8].replace('+', '').replace('\u2212', '-'))
            out.append({
                "poz": int(cleaned[0].rstrip('.')),
                "klub": cleaned[1],
                "odigrano": int(cleaned[2]),
                "pobjede": int(cleaned[3]),
                "nerij": int(cleaned[4]),
                "porazi": int(cleaned[5]),
                "gz": int(cleaned[6]),
                "gp": int(cleaned[7]),
                "razl": razl,
                "bod": int(cleaned[9]),
            })
        except ValueError:
            continue
    return out
# === Sources ===
# Each tuple is (url, naziv, razina, ext_id, sezona); ext_id is matched against
# natjecanja.external_id by the scrape loop below.
LIGE = [
("https://hnl.hr/", "SuperSport HNL", "1.HNL", "supersport_hnl_2025_2026", "2025/2026"),
]
# Module-level connection in autocommit mode: every statement is committed immediately.
conn = psycopg2.connect(**DB); conn.autocommit = True
# Shared cursor used by find_klub() and the scrape loop.
cr = conn.cursor()
def find_klub(naziv):
    """Look up the klubovi id that best matches a scraped club name.

    Football clubs are tried first with the patterns ``HNK <naziv>``,
    ``NK <naziv>``, ``<naziv>`` and ``%<naziv>%`` (in that order); if none of
    them hits, a substring match across all sports is attempted. Club id 4426
    is always excluded.

    Returns:
        The id of the first matching row, or None when nothing matches.
    """
    wildcard = f"%{naziv}%"
    football_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1"
    any_sport_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"
    # HNK / NK prefixes first, then the bare name, then a substring match
    for pattern in (f"HNK {naziv}", f"NK {naziv}", naziv, wildcard):
        cr.execute(football_sql, (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    cr.execute(any_sport_sql, (wildcard,))
    hit = cr.fetchone()
    if not hit:
        return None
    return hit[0]
# Scrape every configured league and replace its stored standings.
for url, naziv, razina, ext_id, sezona in LIGE:
    print(f"=== {naziv} ===")
    body = fetch(url)
    if not body:
        print(" fetch failed")
        continue
    rows = find_table_with_header(body, "Klub")
    if not rows:
        print(" no table found")
        continue
    parsed = parse_standings_rows(rows)
    print(f" {len(parsed)} rows parsed")
    if not parsed:
        # Nothing usable was parsed: do not delete the stored table for nothing.
        print(" nothing parsed - keeping existing rows")
        continue
    cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
    found = cr.fetchone()
    if not found:
        # The competition row is expected to exist already; skip instead of
        # crashing on fetchone()[0].
        print(f" natjecanje with external_id={ext_id} not found - skipped")
        continue
    nid = found[0]
    cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))
    matched = 0
    for r in parsed:
        kid = find_klub(r["klub"])
        if kid:
            matched += 1
        cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                      (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                       porazi, gol_z, gol_p, gol_razlika, bodovi)
                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                   (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                    r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
    print(f" matched klub_id: {matched}/{len(parsed)}")
# Verify Rijeka: print the stored rows whose club name contains "Rijeka".
cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
              WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
for r in cr.fetchall():
    print(f" {r}")
conn.close()
+173
View File
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
HNS Semafor ligaški scraper - parses body.innerText (SPA, no <table> tags)
Path: /opt/pgz-sport/scrapers/hns_lige_standings.py
Author: Damir Radulić / Ri.NET
Source: https://semafor.hns.family/natjecanja/...
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
Run: python3 hns_lige_standings.py
"""
import asyncio

import psycopg2
from playwright.async_api import async_playwright
# Connection parameters passed to psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store and rotating this value.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
# Leagues to scrape. "natj" is the competition name (looked up / stored as
# natjecanja.naziv), "url" the Semafor page to load, "razina" the level label
# stored when the competition row has to be created.
LIGE = [
{"natj": "Supersport HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100391485/supersport-hnl", "razina": "1.HNL"},
{"natj": "2.HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100413651/2-hnl", "razina": "2.HNL"},
{"natj": "3.HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100418001/3-hnl", "razina": "3.HNL"},
{"natj": "Magenta 1.HNL Juniori 2025/26", "url": "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori", "razina": "Juniori"},
{"natj": "1.HNKŽ 2025/26", "url": "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE", "razina": "Žene 1.razred"},
]
def _parse_ljestvica(body_text):
    """Extract standings rows from rendered page text; None if no table marker is found."""
    idx = body_text.find('\nBod\n')
    if idx < 0:
        idx = body_text.find('\nBod ')
    if idx < 0:
        return None
    # Skip the 5-character marker and look at the next 8000 characters only.
    ljestvica_text = body_text[idx + 5:idx + 5 + 8000]
    lines = [l.strip() for l in ljestvica_text.split('\n') if l.strip()]
    # Filter out the single-letter form indicators and the "Forma" label.
    form_markers = ('P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma')
    lines = [l for l in lines if l not in form_markers]
    rows = []
    i = 0
    while i < len(lines):
        # A row is ten consecutive lines: position, club, played, wins, draws,
        # losses, goals for, goals against, goal difference, points. On any
        # mismatch slide the window forward by one line and retry.
        try:
            poz = int(lines[i])
            if poz > 50 or poz < 1:
                i += 1
                continue
            klub = lines[i + 1]
            if klub.isdigit() or len(klub) < 3:
                i += 1
                continue
            # Tolerate an explicit "+" and the typographic minus (U+2212).
            gr_raw = lines[i + 8].strip().replace('+', '').replace('\u2212', '-')
            rows.append({
                "poz": poz, "klub": klub,
                "uk": int(lines[i + 2]), "pob": int(lines[i + 3]),
                "ner": int(lines[i + 4]), "por": int(lines[i + 5]),
                "gp": int(lines[i + 6]), "gpr": int(lines[i + 7]),
                "gr": int(gr_raw), "bod": int(lines[i + 9]),
            })
            i += 10
        except (ValueError, IndexError):
            i += 1
    return rows


async def scrape_one(page, liga):
    """Load one league page and parse its standings from the rendered text.

    The page is a SPA without ``<table>`` tags, so ``document.body.innerText``
    is read after fixed waits and parsed line by line.

    Args:
        page: Playwright page; ``goto``, ``wait_for_timeout``,
            ``query_selector`` and ``evaluate`` are used.
        liga: League entry; the keys ``natj`` and ``url`` are read.

    Returns:
        A list of dicts with the keys ``poz``, ``klub``, ``uk``, ``pob``,
        ``ner``, ``por``, ``gp``, ``gpr``, ``gr`` and ``bod``. An empty list
        is returned when no standings marker is found or when any error
        occurs (the error is printed).
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        try:
            btn = await page.query_selector(".cky-btn-accept")
            if btn:
                await btn.click()
                await page.wait_for_timeout(1500)
        except Exception:
            # Dismissing the cookie banner is best-effort. A bare except here
            # would also swallow asyncio.CancelledError and KeyboardInterrupt.
            pass
        await page.wait_for_timeout(4000)
        body_text = await page.evaluate("() => document.body.innerText")
        rows = _parse_ljestvica(body_text)
        if rows is None:
            print(" No Ljestvica found")
            return []
        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        print(f" ERR: {e}")
        return []
async def run():
    """Scrape every league in LIGE and store the standings in the database.

    For each league that yields rows, the natjecanja row is looked up by name
    (or created), the rows previously stored with source='hns_semafor' are
    deleted and the fresh rows are inserted, each with a best-effort klub_id
    match. Afterwards an audit_feed row is written and summary listings are
    printed. Leagues that yield no rows are left untouched.

    The browser and the database connection are released even when an error
    interrupts the run.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        all_inserted = 0
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()
                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        continue
                    cu.execute("""SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1""", (liga['natj'],))
                    nr = cu.fetchone()
                    if nr:
                        natj_id = nr[0]
                    else:
                        cu.execute("""INSERT INTO pgz_sport.natjecanja
                                      (naziv, sport, razina, sezona, source, source_url)
                                      VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s)
                                      RETURNING id""",
                                   (liga['natj'], liga['razina'], liga['url']))
                        natj_id = cu.fetchone()[0]
                        print(f" Created natjecanje id={natj_id}")
                    # Clear old rows for this natjecanje (no sezona col)
                    cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hns_semafor'",
                               (natj_id,))
                    for r in rows:
                        # Exact name first, then substring; prefer active football clubs.
                        cu.execute("""SELECT id FROM pgz_sport.klubovi
                                      WHERE LOWER(naziv) = LOWER(%s)
                                         OR LOWER(naziv) LIKE LOWER(%s)
                                      ORDER BY
                                        CASE WHEN aktivan THEN 0 ELSE 1 END,
                                        CASE WHEN sport='nogomet' THEN 0 ELSE 1 END,
                                        id ASC LIMIT 1""",
                                   (r['klub'], f"%{r['klub']}%"))
                        kr = cu.fetchone()
                        klub_id = kr[0] if kr else None
                        cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                                      (natjecanje_id, klub_id, klub_naziv, pozicija,
                                       odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                                       gol_razlika, bodovi, source, source_url, updated_at)
                                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                                              'hns_semafor', %s, now())""",
                                   (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['ner'],
                                    r['por'], r['gp'], r['gpr'], r['gr'], r['bod'], liga['url']))
                        all_inserted += 1
            finally:
                await browser.close()
        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")
        cu.execute("""INSERT INTO pgz_sport.audit_feed
                      (table_name, action, source, source_url, details)
                      VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', NULL, %s::jsonb)""",
                   (f'{{"inserted":{all_inserted},"lige":{len(LIGE)}}}',))
        cu.execute("""SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t
                        ON n.id=t.natjecanje_id AND t.source='hns_semafor'
                      WHERE n.source='hns_semafor' AND n.sezona='2025/26'
                      GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HNS lige stats ===")
        for r in cu.fetchall():
            print(f" {r[1]:>3} klubova - {r[0]}")
        # PGŽ clubs in the stored tables. No parameters are passed, so the
        # doubled percent signs reach the server unchanged; LIKE treats "%%"
        # the same as "%".
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.aktivan
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hns_semafor'
                        AND (lower(t.klub_naziv) LIKE '%%rijeka%%' OR lower(t.klub_naziv) LIKE '%%opatija%%'
                          OR lower(t.klub_naziv) LIKE '%%krk%%' OR lower(t.klub_naziv) LIKE '%%delnice%%'
                          OR lower(t.klub_naziv) LIKE '%%zamet%%' OR lower(t.klub_naziv) LIKE '%%orijent%%'
                          OR lower(t.klub_naziv) LIKE '%%cresnik%%' OR lower(t.klub_naziv) LIKE '%%goranin%%'
                          OR lower(t.klub_naziv) LIKE '%%kvarner%%')
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
        for r in cu.fetchall():
            match = f"klub_id={r[5]}" if r[5] else "❌ no match"
            print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {match}")
    finally:
        conn.close()


asyncio.run(run())
+608
View File
@@ -0,0 +1,608 @@
#!/usr/bin/env python3
"""
HNS Semafor scraper for PGŽ football clubs.
Strategy:
1. Seed-map known PGŽ clubs to HNS COMET klub_id (manual list to start)
2. For each klub: fetch /klubovi/{id}/{slug}/ and extract roster (player list)
3. For each player: fetch /igraci/{id}/{slug}/ → store in clanovi + utakmice_log
4. Respect rate limit (1 req / 1.5s), record run in scraper_runs
Run modes:
python hns_semafor.py seed # set hns_klub_id for known clubs
python hns_semafor.py klub <db_klub_id> # scrape one klub roster + players
python hns_semafor.py player <hns_pid> # scrape one player
python hns_semafor.py daily # full daily harvest of seeded PGŽ clubs
"""
import os, re, sys, time, json, logging
from datetime import datetime, date
from urllib.parse import urljoin
import psycopg2
import psycopg2.extras
import requests
from bs4 import BeautifulSoup
# Connection parameters passed to psycopg2.connect() by conn().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store and rotating this value.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# Root URL of the HNS Semafor site; all scraped paths are appended to it.
BASE = "https://semafor.hns.family"
# User-Agent header value sent with every request.
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
# NOTE(review): the module docstring mentions 1 request per 1.5 s; the value used here is 1.6 s.
RATE_S = 1.6 # seconds between requests
# Per-request timeout in seconds passed to requests.get().
TIMEOUT = 25
log = logging.getLogger("hns")
# Log to both a file and stdout.
# NOTE(review): the FileHandler path is absolute; its directory presumably has to exist already — confirm on deploy.
logging.basicConfig(
format="%(asctime)s [%(levelname)s] %(message)s",
level=logging.INFO,
handlers=[
logging.FileHandler("/opt/pgz-sport/_logs/hns_scraper.log"),
logging.StreamHandler(sys.stdout),
],
)
# ═══ Manual seed mapping — PGŽ clubs → HNS COMET id ═══
# Discovered from semafor.hns.family/igraci/1167145/marko-komadina/ matches
SEED_MAP = {
# club name → hns_klub_id
"NK Klana": 1569,
"NK Krk": 1558,
"NK Mune": 1576,
"NK Vihor": 4326,
"NK Doker": 107415,
"HNK Kozala": 3090,
"HNK Lovran": 1574,
"HNK Goranin": 1565,
"NK Risnjak": 1583,
"NK Lokomotiva": 1570,
"NK Omladinac Vrata": 1579,
"NK Draga": 1554,
"NK Zamet": 1589,
"NK Vrbovsko": 1588,
"NK Rikard Benčić": 1582,
"NK OŠK Omišalj": 3071,
}
def conn():
    """Open and return a new psycopg2 connection built from the DB settings."""
    connection = psycopg2.connect(**DB)
    return connection
def fetch(url: str) -> str:
    """GET *url* and return the response body, honouring the rate limit.

    The RATE_S pause is applied after every request, including failed ones,
    so that a run of errors (e.g. in a retry loop over many players) cannot
    hit the server faster than the configured rate.

    Raises:
        requests.RequestException: on transport errors or a non-2xx status
            (via ``raise_for_status``).
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        time.sleep(RATE_S)
def slugify(s: str) -> str:
    """Turn *s* into a lowercase, hyphen-separated ASCII slug.

    The text is lowercased and trimmed, the Croatian letters č, ć, š, ž and đ
    are replaced by c, c, s, z and d, every run of characters outside
    ``a-z0-9`` becomes a single hyphen, and leading/trailing hyphens are
    removed.
    """
    croatian_to_ascii = str.maketrans("čćšžđ", "ccszd")
    lowered = s.lower().strip().translate(croatian_to_ascii)
    return re.sub(r'[^a-z0-9]+', '-', lowered).strip('-')
def cmd_seed():
    """Map SEED_MAP to klubovi.hns_klub_id where naziv matches; auto-INSERT if missing.

    For every (name, HNS id) pair: football clubs whose name contains the
    seed name and whose hns_klub_id is unset (or already equal) are updated;
    if none is updated and no club carries that HNS id yet, a minimal club
    row is inserted.

    Returns:
        dict with the counts ``updated`` and ``inserted``.
    """
    n_updated = 0
    n_inserted = 0
    c = conn()
    try:
        # `with c` commits on success / rolls back on error but does not
        # close the connection, hence the explicit close() below.
        with c:
            cu = c.cursor()
            for naziv, hns_id in SEED_MAP.items():
                slug = slugify(naziv)
                cu.execute("""UPDATE pgz_sport.klubovi
                              SET hns_klub_id=%s, hns_slug=%s, source_synced_at=now()
                              WHERE naziv ILIKE %s AND sport='nogomet'
                                AND (hns_klub_id IS NULL OR hns_klub_id=%s)""",
                           (hns_id, slug, f"%{naziv}%", hns_id))
                if cu.rowcount > 0:
                    n_updated += cu.rowcount
                    continue
                # Try by hns_klub_id directly (already set elsewhere)
                cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s", (hns_id,))
                if cu.fetchone():
                    continue
                # Insert new minimal row
                cu.execute("""INSERT INTO pgz_sport.klubovi
                              (naziv, sport, razina, hns_klub_id, hns_slug, aktivan, region,
                               source_synced_at, napomena)
                              VALUES (%s,'nogomet','3.HRL',%s,%s,true,'PGŽ',now(),
                                      'Auto-seeded from HNS Semafor (legitimni interes — analitika)')""",
                           (naziv, hns_id, slug))
                n_inserted += 1
    finally:
        c.close()
    log.info(f"Seed: updated={n_updated}, inserted={n_inserted}")
    return {"updated": n_updated, "inserted": n_inserted}
def parse_player_profile(hns_pid: int, html: str) -> dict:
    """Parse a player page (/igraci/{id}/{slug}/) into a dict.

    Args:
        hns_pid: HNS player id; copied into the result.
        html: HTML of the player page.

    Returns:
        dict that always holds ``hns_pid`` and ``matches`` (empty list) and,
        when found on the page, ``ime_prezime``, ``slika_url``,
        ``trenutni_klub_hns_id``, ``trenutni_klub``, ``datum_rodenja_raw``,
        ``datum_rodenja`` (ISO date) and ``mjesto_rodenja``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {"hns_pid": hns_pid, "matches": []}
    # Name in first <h1>
    h1 = soup.find('h1')
    if h1:
        data['ime_prezime'] = h1.get_text(' ', strip=True)
    # Photo: the <img> whose alt text equals the player's name. Skipped when
    # no name was found, otherwise alt='' would match any decorative image.
    if data.get('ime_prezime'):
        img = soup.find('img', alt=data['ime_prezime'])
        if img and img.get('src'):
            data['slika_url'] = img['src']
    # Current club — first link to /klubovi/<id>/ and the <h4> inside it
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        m = re.search(r'/klubovi/(\d+)/', klub_link['href'])
        if m:
            data['trenutni_klub_hns_id'] = int(m.group(1))
        h = klub_link.find('h4')
        if h:
            data['trenutni_klub'] = h.get_text(' ', strip=True)
    # Date of birth — targets <li class="dob"> directly
    li_dob = soup.find('li', class_='dob')
    if li_dob:
        h4 = li_dob.find('h4')
        if h4:
            t = h4.get_text(' ', strip=True)
            data['datum_rodenja_raw'] = t
            m = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
            if m:
                day, month, year = (int(g) for g in m.groups())
                try:
                    data['datum_rodenja'] = date(year, month, day).isoformat()
                except ValueError:
                    # Impossible calendar date (e.g. 31.02.): keep only the raw text.
                    pass
    # Place of birth — targets <li class="pob">
    li_pob = soup.find('li', class_='pob')
    if li_pob:
        h4_m = li_pob.find('h4')
        if h4_m:
            data['mjesto_rodenja'] = h4_m.get_text(strip=True)
    # Old fallback method — the <h4> preceding an <h3> "Mjesto rođenja". Used
    # only when the targeted lookup found nothing, so it cannot overwrite it.
    if 'mjesto_rodenja' not in data:
        for h3 in soup.find_all('h3'):
            if 'Mjesto rođenja' in h3.get_text():
                prev = h3.find_previous('h4')
                if prev:
                    data['mjesto_rodenja'] = prev.get_text(strip=True)
    return data
def upsert_player(klub_id_db: int, prof: dict) -> int:
    """Upsert a clanovi row from a parsed profile and return its id.

    The existing row is located by source='hns_semafor' and
    source_id=str(prof['hns_pid']); it is updated when found, otherwise a new
    row is inserted. ``ime_prezime`` is split at the first space into first
    name and surname.

    Args:
        klub_id_db: klubovi id the player is attached to.
        prof: dict as produced by parse_player_profile(); ``hns_pid`` is
            required, every other key is optional.

    Returns:
        The clanovi id of the updated or inserted row.
    """
    full_name = prof.get('ime_prezime', '') or ''
    ime, *prezime = full_name.split(' ', 1)
    prezime = prezime[0] if prezime else ''
    slug = slugify(prof.get('ime_prezime', ''))
    url = f"{BASE}/igraci/{prof['hns_pid']}/{slug}/"
    source_id = str(prof['hns_pid'])
    c = conn()
    try:
        # `with c` commits on success / rolls back on error but does not
        # close the connection, hence the explicit close() below.
        with c:
            cu = c.cursor()
            # Try find existing by source_id
            cu.execute("""SELECT id FROM pgz_sport.clanovi
                          WHERE source='hns_semafor' AND source_id=%s""", (source_id,))
            row = cu.fetchone()
            if row:
                cid = row[0]
                cu.execute("""UPDATE pgz_sport.clanovi
                              SET ime=%s, prezime=%s, datum_rodenja=%s, mjesto_rodenja=%s,
                                  slika_url=%s, klub_id=%s, source_url=%s, source_synced_at=now()
                              WHERE id=%s""",
                           (ime, prezime, prof.get('datum_rodenja'), prof.get('mjesto_rodenja'),
                            prof.get('slika_url'), klub_id_db, url, cid))
            else:
                cu.execute("""INSERT INTO pgz_sport.clanovi
                              (klub_id, ime, prezime, datum_rodenja, mjesto_rodenja, slika_url,
                               source, source_id, source_url, source_synced_at, slug)
                              VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                              RETURNING id""",
                           (klub_id_db, ime, prezime, prof.get('datum_rodenja'),
                            prof.get('mjesto_rodenja'), prof.get('slika_url'),
                            source_id, url, slug))
                cid = cu.fetchone()[0]
    finally:
        c.close()
    return cid
def cmd_player(hns_pid: int, klub_id_db: int = None):
    """Scrape a single player by HNS id and upsert the result.

    When *klub_id_db* is not given it is resolved first from a previously
    stored clanovi row and then from the club linked on the player's page.

    Returns:
        The clanovi id from upsert_player(), or None when no klubovi id could
        be resolved (the upsert is skipped and a warning is logged).

    Raises:
        Whatever fetch() raises for the player page.
    """
    def query_one(sql, params):
        # Run one SELECT on a short-lived connection. `with c` only ends the
        # transaction, so the connection is closed explicitly.
        c = conn()
        try:
            with c:
                cu = c.cursor()
                cu.execute(sql, params)
                return cu.fetchone()
        finally:
            c.close()

    if klub_id_db is None:
        # try to infer from current klub via DB if previously stored
        r = query_one("""SELECT klub_id FROM pgz_sport.clanovi
                         WHERE source='hns_semafor' AND source_id=%s""", (str(hns_pid),))
        if r:
            klub_id_db = r[0]
    url = f"{BASE}/igraci/{hns_pid}/dummy/"  # slug is forgiving; HNS redirects
    html = fetch(url)
    prof = parse_player_profile(hns_pid, html)
    log.info(f"Parsed: {prof.get('ime_prezime','?')} (HNS#{hns_pid}) klub={prof.get('trenutni_klub','?')}")
    # Resolve current_klub_hns_id → klub_id_db if not provided
    if klub_id_db is None and prof.get('trenutni_klub_hns_id'):
        r = query_one("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s",
                      (prof['trenutni_klub_hns_id'],))
        if r:
            klub_id_db = r[0]
    if klub_id_db is None:
        log.warning(f"No DB klub_id for HNS player {hns_pid} — skipping upsert")
        return None
    return upsert_player(klub_id_db, prof)
def cmd_daily():
    """Refresh seeded clubs and re-fetch already known players.

    A scraper_runs row is opened, cmd_seed() is run, then up to 500 known
    'hns_semafor' players (least recently synced first) are re-scraped with
    cmd_player(). The run row is always finalised with status 'ok' or
    'partial', the number of players actually upserted and the collected
    errors. Roster discovery is not implemented yet.
    """
    def new_conn_do(work):
        # Run work(cursor) in its own transaction on a short-lived connection.
        # `with c` only ends the transaction, so close explicitly.
        c = conn()
        try:
            with c:
                return work(c.cursor())
        finally:
            c.close()

    def open_run(cu):
        cu.execute("""INSERT INTO pgz_sport.scraper_runs (source, scope)
                      VALUES ('hns_semafor','daily') RETURNING id""")
        return cu.fetchone()[0]

    def known_player_ids(cu):
        cu.execute("""SELECT source_id FROM pgz_sport.clanovi
                      WHERE source='hns_semafor' ORDER BY source_synced_at NULLS FIRST LIMIT 500""")
        return [r[0] for r in cu.fetchall()]

    run_id = new_conn_do(open_run)
    inserted = 0
    updated = 0
    errors = []
    try:
        # Phase 1: ensure seed mapping is current
        cmd_seed()
        log.info("=== Daily HNS harvest start ===")
        # TODO: roster discovery requires per-klub roster page. For now, only re-fetch known players.
        pids = new_conn_do(known_player_ids)
        for pid in pids:
            try:
                # cmd_player() returns None when the upsert was skipped; only
                # count players that were actually written.
                if cmd_player(int(pid)) is not None:
                    updated += 1
            except Exception as e:
                log.error(f"player {pid}: {e}")
                errors.append({"pid": pid, "err": str(e)})
        log.info(f"=== Daily done: updated={updated} errors={len(errors)} ===")
    finally:
        def close_run(cu):
            cu.execute("""UPDATE pgz_sport.scraper_runs
                          SET finished_at=now(), status=%s, rows_updated=%s, errors=%s::jsonb, rows_inserted=%s
                          WHERE id=%s""",
                       ("ok" if not errors else "partial", updated, json.dumps(errors), inserted, run_id))
        new_conn_do(close_run)
def parse_match(html, match_url=None):
    """HNS match parser v4 — uses precise class signals.

    Player <li class='row match_lineup' data-personid='87561'>:
    <div class='shirtNumber'>9</div>
    <div class='playerPhoto'><div class='photo'><img src='...' /></div></div>
    <div class='playerName'><h3><a href='/igraci/.../'>Ivan Laginja</a></h3>Igrač</div>
    <div class='matchEvents'>
    <ul class='events'>
    <li class='goal'><div class='icon' title='Gol'></div>40'</li>
    <li class='substitutionOut'><div class='icon' title='Izmjena'></div>87'</li>
    <li class='yellow'>...</li>
    <li class='red'>...</li>
    <li class='ownGoal'>...</li>
    <li class='substitutionIn'>...</li>
    </ul>
    </div>

    Args:
        html: HTML of a match page.
        match_url: URL the page came from; copied into the result.

    Returns:
        dict with ``teams`` (team name -> list of player dicts), ``match_url``,
        ``title`` (text of the first <h1>) and ``meta`` (club names and logos
        plus, when found, ``datum``, ``vrijeme``, ``gledatelja``,
        ``rezultat`` and ``natjecanje``). Minutes played are an estimate based
        on a 90-minute match; stoppage-time minutes such as "45+2" count as
        47.
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"teams": {}, "match_url": match_url, "meta": {}, "title": ""}
    h1 = soup.find('h1')
    out['title'] = h1.get_text(' ', strip=True) if h1 else ''
    EVENT_KIND_MAP = {
        'goal': 'gol',
        'ownGoal': 'autogol',
        'penaltyGoal': 'gol',
        'yellow': 'zuti',
        'secondYellow': 'zuti2',  # second yellow → effectively red
        'red': 'crveni',
        'substitutionIn': 'subIn',
        'substitutionOut': 'subOut',
    }

    def minute_value(text):
        # "87" -> 87, "45+2" -> 47. Concatenating the digits instead would
        # turn "45+2" into 452. Returns None when there are no digits.
        parts = re.findall(r'\d+', text or '')
        return sum(int(p) for p in parts) if parts else None

    def parse_team_div(team_div):
        if not team_div:
            return None, []
        ul = team_div.find('ul', recursive=False)
        if not ul:
            ul = team_div.find('ul')
        if not ul:
            return None, []
        team_name = None
        players = []
        is_starter = True
        for li in ul.find_all('li', recursive=False):
            cls = li.get('class') or []
            if 'header' in cls and 'clubName' in cls:
                team_name = li.get_text(' ', strip=True)
                continue
            if 'header' in cls and 'separatorTitle' in cls:
                # Everything after the "Pričuvni" separator is the bench.
                if 'Pričuvni' in li.get_text(' ', strip=True):
                    is_starter = False
                continue
            if not ('row' in cls and 'match_lineup' in cls):
                continue
            # Player extraction: data attribute first, profile link as fallback
            pid = li.get('data-personid')
            if not pid:
                a = li.find('a', href=re.compile(r'/igraci/(\d+)/'))
                if not a:
                    continue
                pm = re.search(r'/igraci/(\d+)/', a['href'])
                pid = pm.group(1)
            try:
                pid = int(pid)
            except (TypeError, ValueError):
                continue
            # Shirt number
            sn = li.find('div', class_='shirtNumber')
            broj_dresa = None
            if sn:
                bs = sn.get_text(' ', strip=True).strip()
                if bs.isdigit():
                    broj_dresa = int(bs)
            # Image
            img = li.find('img')
            slika = img.get('src') if img else None
            # Name + position
            pn = li.find('div', class_='playerName')
            ime_prezime = ''
            pozicija = None
            captain = False
            if pn:
                a2 = pn.find('a')
                if a2:
                    ime_prezime = a2.get_text(' ', strip=True)
                # Position is text after <h3>
                full = pn.get_text(' ', strip=True)
                rest = full.replace(ime_prezime, '').strip()
                if '(C)' in rest:
                    captain = True
                rest = rest.replace('(C)', '').strip()
                if 'Vratar' in rest:
                    pozicija = 'Vratar'
                elif 'Igrač' in rest:
                    pozicija = 'Igrač'
            # Events
            events = []
            me_div = li.find('div', class_='matchEvents')
            if me_div:
                ev_ul = me_div.find('ul', class_='events')
                if ev_ul:
                    for ev_li in ev_ul.find_all('li', recursive=False):
                        ev_cls = ev_li.get('class') or []
                        kind = None
                        for k in ev_cls:
                            if k in EVENT_KIND_MAP:
                                kind = EVENT_KIND_MAP[k]
                                break
                        text = ev_li.get_text(' ', strip=True)
                        mm = re.search(r"(\d+(?:\+\d+)?)\s*'", text)
                        minute = mm.group(1) if mm else None
                        if kind:
                            events.append({'kind': kind, 'minute': minute})
            # Aggregate counts
            cnt_gol = sum(1 for e in events if e['kind'] in ('gol',))
            cnt_zuti = sum(1 for e in events if e['kind'] == 'zuti')
            cnt_crveni = sum(1 for e in events if e['kind'] in ('crveni', 'zuti2'))  # 2nd yellow = red
            # Substitution minutes (in/out)
            sub_in_min = next((e['minute'] for e in events if e['kind'] == 'subIn'), None)
            sub_out_min = next((e['minute'] for e in events if e['kind'] == 'subOut'), None)
            # Estimate minutes played
            minutes = None
            if is_starter:
                if sub_out_min:
                    minutes = minute_value(sub_out_min)
                else:
                    minutes = 90  # full game
            else:  # bench
                if sub_in_min:
                    came_on = minute_value(sub_in_min)
                    if came_on is not None:
                        minutes = max(0, 90 - came_on)
                else:
                    minutes = 0  # never came on
            players.append({
                'hns_pid': pid,
                'ime_prezime': ime_prezime,
                'broj_dresa': broj_dresa,
                'pozicija': pozicija,
                'slika_url': slika,
                'captain': captain,
                'starter': is_starter,
                'events': events,
                'pogodaka': cnt_gol,
                'zuti_kartoni': cnt_zuti,
                'crveni_kartoni': cnt_crveni,
                'minute': minutes,
            })
        return team_name, players

    home_div = soup.find('div', class_='homeTeam')
    away_div = soup.find('div', class_='awayTeam')
    home_name, home_players = parse_team_div(home_div)
    away_name, away_players = parse_team_div(away_div)
    if home_name:
        out['teams'][home_name] = home_players
    if away_name:
        out['teams'][away_name] = away_players
    # Logo URLs (first image inside each team block)
    if home_div:
        h_img = home_div.find('img')
        out['meta']['klub_dom_logo'] = h_img.get('src') if h_img else None
    if away_div:
        a_img = away_div.find('img')
        out['meta']['klub_gost_logo'] = a_img.get('src') if a_img else None
    out['meta']['klub_dom'] = home_name
    out['meta']['klub_gost'] = away_name
    # Date/time, viewership, score, competition
    body_text = soup.get_text(' ', strip=True)
    dm = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})\.?\s*(\d{1,2}:\d{2})', body_text)
    if dm:
        # The pattern guarantees exactly three dot-separated parts: d.m.yyyy
        d_parts = dm.group(1).split('.')
        out['meta']['datum'] = f"{d_parts[2]}-{d_parts[1].zfill(2)}-{d_parts[0].zfill(2)}"
        out['meta']['vrijeme'] = dm.group(2)
    gm = re.search(r'Gledatelja:\s*(\d+)', body_text)
    if gm:
        out['meta']['gledatelja'] = int(gm.group(1))
    rm = re.search(r'(\d+):(\d+)', out.get('title', ''))
    if rm:
        out['meta']['rezultat'] = f"{rm.group(1)}:{rm.group(2)}"
    # The competition is taken from the text after the last comma of the title.
    nat_match = out.get('title', '').split(',')
    if len(nat_match) > 1:
        out['meta']['natjecanje'] = nat_match[-1].strip()
    return out
# Tokens that appear in most club names and therefore prove nothing when shared.
_GENERIC_CLUB_TOKENS = frozenset({'nk', 'hnk', 'klub', 'nogometni'})


def _name_tokens(name_low):
    """Return the set of non-empty word tokens of an already lower-cased name."""
    return {tok for tok in re.split(r'\s+', re.sub(r'[^\w]', ' ', name_low)) if tok}


def _find_own_team(klub_naziv_low, teams):
    """Pick the entry of ``teams`` that belongs to our club.

    Teams are tried in dict order; for each one an exact substring test (both
    directions) is tried first, then a looser token-overlap test.

    Returns ``(team_name, roster, fuzzy)``; ``(None, [], False)`` if nothing matches.
    """
    klub_tokens = _name_tokens(klub_naziv_low)
    klub_words = klub_naziv_low.split()
    for tn, players in teams.items():
        tn_low = tn.lower()
        if klub_naziv_low in tn_low or tn_low in klub_naziv_low:
            return tn, players, False
        # Token-set overlap (e.g. "NK Krk" vs "NK Krk Krk" or "NK Vihor" vs "NK Vihor (B)"),
        # ignoring generic tokens, and requiring the last word of either name to
        # occur in the other name.
        strong = (klub_tokens & _name_tokens(tn_low)) - _GENERIC_CLUB_TOKENS
        tn_words = tn_low.split()
        last_word_shared = bool(
            (klub_words and klub_words[-1] in tn_low)
            or (tn_words and tn_words[-1] in klub_naziv_low)
        )
        if strong and last_word_shared:
            return tn, players, True
    return None, [], False


def _upsert_player(cu, klub_id_db, pl):
    """Insert or refresh the ``clanovi`` row for one parsed player and return its id."""
    name = pl['ime_prezime'] or ''
    # Everything before the last space is the first name, the rest the surname.
    ime, sep, prezime = name.rpartition(' ')
    if not sep:
        ime, prezime = name, ''
    slug = slugify(name)
    src_url = f"{BASE}/igraci/{pl['hns_pid']}/{slug}/"
    cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE source='hns_semafor' AND source_id=%s""",
               (str(pl['hns_pid']),))
    row = cu.fetchone()
    if row:
        cid = row[0]
        # COALESCE keeps the stored value when the scrape produced nothing for a field.
        cu.execute("""UPDATE pgz_sport.clanovi
                      SET ime=%s, prezime=%s, slika_url=COALESCE(NULLIF(%s,''), slika_url),
                          broj_dresa=COALESCE(%s, broj_dresa),
                          pozicija=COALESCE(%s, pozicija),
                          klub_id=%s, source_url=%s, source_synced_at=now(), slug=%s
                      WHERE id=%s""",
                   (ime, prezime, pl.get('slika_url') or '', pl.get('broj_dresa'),
                    pl.get('pozicija'), klub_id_db, src_url, slug, cid))
        return cid
    cu.execute("""INSERT INTO pgz_sport.clanovi
                  (klub_id, ime, prezime, slika_url, broj_dresa, pozicija,
                   source, source_id, source_url, source_synced_at, slug)
                  VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                  RETURNING id""",
               (klub_id_db, ime, prezime, pl.get('slika_url'), pl.get('broj_dresa'),
                pl.get('pozicija'), str(pl['hns_pid']), src_url, slug))
    return cu.fetchone()[0]


def _log_appearance(cu, cid, mid, match_url, meta, klub_dom, klub_gost, klub_id_db, pl):
    """Upsert one ``utakmice_log`` row: a player's stats in a single match."""
    cu.execute("""INSERT INTO pgz_sport.utakmice_log
                  (clan_id, source, source_match_id, source_url, datum, vrijeme,
                   natjecanje, klub_dom, klub_dom_logo, klub_gost, klub_gost_logo, rezultat, za_klub_id,
                   pogodaka, zuti_kartoni, crveni_kartoni, minute, zapocet_kao_starter)
                  VALUES (%s,'hns_semafor',%s,%s,%s,%s,
                          %s,%s,%s,%s,%s,%s,%s,
                          %s,%s,%s,%s,%s)
                  ON CONFLICT (source, source_match_id, clan_id) DO UPDATE SET
                   datum=EXCLUDED.datum, rezultat=EXCLUDED.rezultat,
                   za_klub_id=EXCLUDED.za_klub_id,
                   pogodaka=EXCLUDED.pogodaka, zuti_kartoni=EXCLUDED.zuti_kartoni,
                   crveni_kartoni=EXCLUDED.crveni_kartoni, minute=EXCLUDED.minute,
                   zapocet_kao_starter=EXCLUDED.zapocet_kao_starter,
                   klub_dom_logo=EXCLUDED.klub_dom_logo, klub_gost_logo=EXCLUDED.klub_gost_logo""",
               (cid, mid, match_url,
                meta.get('datum'), meta.get('vrijeme'),
                meta.get('natjecanje'), klub_dom, meta.get('klub_dom_logo'),
                klub_gost, meta.get('klub_gost_logo'),
                meta.get('rezultat'), klub_id_db,
                pl.get('pogodaka', 0), pl.get('zuti_kartoni', 0),
                pl.get('crveni_kartoni', 0), pl.get('minute'),
                pl.get('starter', True)))


def cmd_klub(klub_id_db: int, max_matches: int = 999):
    """Scrape one club: club page → all matches → upsert our players + utakmice_log rows.

    Args:
        klub_id_db: primary key of the club in ``pgz_sport.klubovi``.
        max_matches: upper bound on how many linked matches are processed.

    Returns:
        Number of distinct player ids seen for our team, or 0 when the club
        is unknown, has no ``hns_klub_id``/name, or its page cannot be fetched.
    """
    with conn() as c:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv, hns_klub_id, hns_slug FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
        klub = cu.fetchone()
    if not klub or not klub['hns_klub_id']:
        log.error(f"Klub #{klub_id_db}: nema hns_klub_id")
        return 0
    klub_naziv_low = (klub['naziv'] or '').lower()
    if not klub_naziv_low.strip():
        # An empty name is a substring of every team name and would match any team.
        log.error(f"Klub #{klub_id_db}: nema naziv")
        return 0

    klub_url = f"{BASE}/klubovi/{klub['hns_klub_id']}/{klub['hns_slug'] or 'k'}/"
    log.info(f"Klub: {klub['naziv']} → {klub_url}")
    try:
        html = fetch(klub_url)
    except Exception as e:
        log.error(f"klub fetch failed: {e}")
        return 0

    # Collect linked match ids, keeping first-seen order and dropping repeats.
    soup = BeautifulSoup(html, 'html.parser')
    href_re = re.compile(r'/utakmice/(\d+)/')
    match_ids = []
    known_ids = set()
    for a in soup.find_all('a', href=href_re):
        mm = href_re.search(a['href'])
        if mm and mm.group(1) not in known_ids:
            known_ids.add(mm.group(1))
            match_ids.append(mm.group(1))
    log.info(f" found {len(match_ids)} matches; processing up to {max_matches}")

    seen_pids = set()
    matches_logged = 0
    for mid in match_ids[:max_matches]:
        match_url = f"{BASE}/utakmice/{mid}/"
        try:
            md = parse_match(fetch(match_url), match_url=match_url)
        except Exception as e:
            log.error(f" match {mid}: {e}")
            continue
        if not md.get('teams'):
            log.warning(f" match {mid}: no teams parsed")
            continue

        # Find which team (home or away) is OURS.
        matched_team, roster, fuzzy = _find_own_team(klub_naziv_low, md['teams'])
        if fuzzy:
            log.info(f" fuzzy match: {klub['naziv']} ≈ {matched_team}")
        if not roster:
            continue  # silently skip non-matching

        meta = md.get('meta', {})
        if 'klub_dom' in meta or 'klub_gost' in meta:
            # Prefer the parser's explicit home/away names: dict key order is
            # wrong whenever the home team name could not be parsed.
            klub_dom, klub_gost = meta.get('klub_dom'), meta.get('klub_gost')
        else:
            team_keys = list(md['teams'].keys())
            klub_dom = team_keys[0] if team_keys else None
            klub_gost = team_keys[1] if len(team_keys) > 1 else None

        with conn() as c:
            cu = c.cursor()
            for pl in roster:
                if not pl.get('hns_pid'):
                    continue
                seen_pids.add(pl['hns_pid'])
                cid = _upsert_player(cu, klub_id_db, pl)
                _log_appearance(cu, cid, mid, match_url, meta, klub_dom, klub_gost, klub_id_db, pl)
            c.commit()
        matches_logged += 1

    log.info(f"Klub {klub['naziv']} done: {len(seen_pids)} unique players, {matches_logged} matches logged")
    return len(seen_pids)
if __name__ == '__main__':
    # Command-line entry point: seed | player <id> | daily | klub <db_klub_id> [max_matches] | klub_all
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    try:
        if cmd == 'seed':
            print(cmd_seed())
        elif cmd == 'player':
            # Same arity check as the 'klub' branch instead of an IndexError traceback.
            if len(sys.argv) < 3:
                print("Usage: player <id>")
                sys.exit(2)
            cid = cmd_player(int(sys.argv[2]))
            print(f"clan_id={cid}")
        elif cmd == 'daily':
            cmd_daily()
        elif cmd == 'klub':
            if len(sys.argv) < 3:
                print("Usage: klub <db_klub_id> [max_matches]")
                sys.exit(2)
            # NOTE(review): CLI default is 1 match although cmd_klub itself defaults to 999 — confirm intent.
            max_m = int(sys.argv[3]) if len(sys.argv) > 3 else 1
            cmd_klub(int(sys.argv[2]), max_matches=max_m)
        elif cmd == 'klub_all':
            # Scrape all PGŽ clubs that have hns_klub_id set
            with conn() as c:
                cu = c.cursor()
                cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id IS NOT NULL ORDER BY id")
                kids = [r[0] for r in cu.fetchall()]
            log.info(f"Scraping rosters for {len(kids)} klubova…")
            for kid in kids:
                # One failing club must not abort the whole batch.
                try:
                    cmd_klub(kid, max_matches=999)
                except Exception as e:
                    log.error(f"klub {kid}: {e}")
        else:
            print(f"Unknown: {cmd}")
            sys.exit(2)
    except ValueError as e:
        # Raised by int() on a non-numeric id / max_matches argument.
        print(f"Invalid numeric argument: {e}")
        sys.exit(2)
+168
View File
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
Path: /opt/pgz-sport/scrapers/hos_scraper.py
"""
import requests, re, json, psycopg2, html as ihtml
# Connection settings for psycopg2.connect(**DB).
# The original passed `port` twice (6432 and 5432), which is a SyntaxError
# ("keyword argument repeated"); only the first value is kept.
# NOTE(review): 6432 was chosen because it was listed first, next to this host — confirm the intended port.
# NOTE(review): credentials are hard-coded in source — consider loading them from the environment.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# User-Agent sent with every HTTP request made by this scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}
# HTML tag names are case-insensitive; \b stops "<t[hd]" from also matching "<thead".
_TABLE_RE = re.compile(r'<table\b[^>]*>(.+?)</table>', re.DOTALL | re.IGNORECASE)
_ROW_RE = re.compile(r'<tr\b[^>]*>(.+?)</tr>', re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', re.DOTALL | re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_LEADING_INT_RE = re.compile(r'(\d+)')


def parse_table(html, table_idx):
    """Parse the ``table_idx``-th ``<table>`` of ``html`` into standings rows.

    Expected cell layout: ``['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']``
    (position, club in column 2 or else column 1, played, won, lost, points).

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``uk``, ``pob``, ``por``,
        ``bod`` and ``ner`` (always 0). Header rows, rows with fewer than six
        cells, rows without a leading position number and rows whose numeric
        cells are not integers are skipped. An out-of-range index gives ``[]``.
    """
    tables = _TABLE_RE.findall(html)
    if table_idx >= len(tables):
        return []
    out = []
    for row in _ROW_RE.findall(tables[table_idx]):
        clean = [ihtml.unescape(_ANY_TAG_RE.sub('', cell)).strip()
                 for cell in _CELL_RE.findall(row)]
        if not clean or not clean[0]:
            continue
        # Skip header rows
        if clean[0] in ('#', 'Pos') or (len(clean) > 1 and 'Utakmice' in clean[1]):
            continue
        poz_match = _LEADING_INT_RE.match(clean[0])
        if not poz_match or len(clean) < 6:
            continue
        klub = clean[2] or clean[1]
        if not klub:
            continue
        try:
            out.append({
                "poz": int(poz_match.group(1)),
                "klub": klub,
                "uk": int(clean[3]),
                "pob": int(clean[4]),
                "por": int(clean[5]),
                # The points column is optional.
                "bod": int(clean[6]) if len(clean) > 6 else 0,
                "ner": 0,
            })
        except ValueError:
            # A non-numeric stats cell: not a standings row.
            continue
    return out
# h1–h5 headings; case-insensitive because HTML tag names are, \b so "<h1x>" is not a heading.
_HEADING_RE = re.compile(r'<(h[1-5])\b[^>]*>(.*?)</\1>', re.DOTALL | re.IGNORECASE)


def find_table_titles(html):
    """Return ``(offset, title)`` for every h1–h5 heading in ``html``, in document order.

    ``offset`` is the character position where the heading tag starts, so a
    caller can pair each table with the closest heading before it. Inner tags
    are stripped and entities unescaped; headings whose text is five
    characters or shorter are ignored.
    """
    out = []
    for m in _HEADING_RE.finditer(html):
        title = ihtml.unescape(re.sub(r'<[^>]+>', '', m.group(2))).strip()
        if len(title) > 5:
            out.append((m.start(), title))
    return out
def main():
    """Scrape league tables from the hos-cvf.hr home page and store them.

    For each entry of the hard-coded league mapping the function parses the
    table at that index, gets or creates the ``natjecanja`` row, replaces this
    source's rows in ``natjecanja_tablice`` and finally prints summaries and
    writes one ``audit_feed`` row.

    Raises:
        requests.HTTPError: if the home page does not return a success status.
    """
    resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
    # Do not parse (and then store) an error page as if it were the standings.
    resp.raise_for_status()
    html = resp.text
    print(f"Length: {len(html)}")
    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    print(f"Tables: {len(tables)}")

    # Diagnostic only: show the closest heading before each table.
    title_positions = find_table_titles(html)
    table_positions = [m.start() for m in re.finditer(r'<table[^>]*>', html)]
    table_with_title = []
    for tp in table_positions:
        preceding = [t for pos, t in title_positions if pos < tp]
        table_with_title.append((tp, preceding[-1] if preceding else "Unknown"))
    print("\n=== Table titles ===")
    for i, (tp, t) in enumerate(table_with_title[:8]):
        print(f" Table {i+1}: {t[:80]}")

    # Manual mapping of table index -> league (tables 1-5 of the home page).
    LEAGUES_2025_26 = [
        # Idx, Name, Razina, Spol
        (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
        (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
        (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
        (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
        (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
    ]

    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_seen = set()
        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # Get/create natjecanje
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
                              VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
                              RETURNING id""", (natj_naziv, razina))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            # Replace this league's rows; committed together below so a failure
            # cannot leave the league empty or half-filled.
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for row in rows:
                klub_id = None
                # Exact or substring name match; prefer active, volleyball, PGŽ clubs.
                cu.execute("""SELECT id, region FROM pgz_sport.klubovi
                              WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
                              ORDER BY
                               CASE WHEN aktivan THEN 0 ELSE 1 END,
                               CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                               CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                               id ASC LIMIT 1""", (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id = kr[0]
                    if kr[1] == 'PGŽ':
                        pgz_seen.add(row['klub'])
                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                              (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                               nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                              VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
                           (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
                            json.dumps({"spol": spol})))
                total_inserted += 1
            conn.commit()

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")
        cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
                      FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
                      WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HOS lige ===")
        for rec in cu.fetchall():
            print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")
        cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hos_cvf' AND k.region='PGŽ'
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HOS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")
        # sorted() keeps the audit payload deterministic between runs.
        cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                      VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
                   (json.dumps({"inserted": total_inserted, "pgz_seen": sorted(pgz_seen)}),))
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


if __name__ == "__main__":
    main()
+168
View File
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
"""
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
Path: /opt/pgz-sport/scrapers/hos_scraper.py
"""
import requests, re, json, psycopg2, html as ihtml
# Keyword arguments for psycopg2.connect(**DB).
# NOTE(review): credentials are hard-coded in source — consider loading them from the environment.
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': 'R1net2026!SecureDB#v7',
}
# User-Agent sent with every HTTP request made by this scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}
# HTML tag names are case-insensitive; \b stops "<t[hd]" from also matching "<thead".
_TABLE_RE = re.compile(r'<table\b[^>]*>(.+?)</table>', re.DOTALL | re.IGNORECASE)
_ROW_RE = re.compile(r'<tr\b[^>]*>(.+?)</tr>', re.DOTALL | re.IGNORECASE)
_CELL_RE = re.compile(r'<t[hd]\b[^>]*>(.*?)</t[hd]>', re.DOTALL | re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_LEADING_INT_RE = re.compile(r'(\d+)')


def parse_table(html, table_idx):
    """Parse the ``table_idx``-th ``<table>`` of ``html`` into standings rows.

    Expected cell layout: ``['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']``
    (position, club in column 2 or else column 1, played, won, lost, points).

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``uk``, ``pob``, ``por``,
        ``bod`` and ``ner`` (always 0). Header rows, rows with fewer than six
        cells, rows without a leading position number and rows whose numeric
        cells are not integers are skipped. An out-of-range index gives ``[]``.
    """
    tables = _TABLE_RE.findall(html)
    if table_idx >= len(tables):
        return []
    out = []
    for row in _ROW_RE.findall(tables[table_idx]):
        clean = [ihtml.unescape(_ANY_TAG_RE.sub('', cell)).strip()
                 for cell in _CELL_RE.findall(row)]
        if not clean or not clean[0]:
            continue
        # Skip header rows
        if clean[0] in ('#', 'Pos') or (len(clean) > 1 and 'Utakmice' in clean[1]):
            continue
        poz_match = _LEADING_INT_RE.match(clean[0])
        if not poz_match or len(clean) < 6:
            continue
        klub = clean[2] or clean[1]
        if not klub:
            continue
        try:
            out.append({
                "poz": int(poz_match.group(1)),
                "klub": klub,
                "uk": int(clean[3]),
                "pob": int(clean[4]),
                "por": int(clean[5]),
                # The points column is optional.
                "bod": int(clean[6]) if len(clean) > 6 else 0,
                "ner": 0,
            })
        except ValueError:
            # A non-numeric stats cell: not a standings row.
            continue
    return out
# h1–h5 headings; case-insensitive because HTML tag names are, \b so "<h1x>" is not a heading.
_HEADING_RE = re.compile(r'<(h[1-5])\b[^>]*>(.*?)</\1>', re.DOTALL | re.IGNORECASE)


def find_table_titles(html):
    """Return ``(offset, title)`` for every h1–h5 heading in ``html``, in document order.

    ``offset`` is the character position where the heading tag starts, so a
    caller can pair each table with the closest heading before it. Inner tags
    are stripped and entities unescaped; headings whose text is five
    characters or shorter are ignored.
    """
    out = []
    for m in _HEADING_RE.finditer(html):
        title = ihtml.unescape(re.sub(r'<[^>]+>', '', m.group(2))).strip()
        if len(title) > 5:
            out.append((m.start(), title))
    return out
def main():
    """Scrape league tables from the hos-cvf.hr home page and store them.

    For each entry of the hard-coded league mapping the function parses the
    table at that index, gets or creates the ``natjecanja`` row, replaces this
    source's rows in ``natjecanja_tablice`` and finally prints summaries and
    writes one ``audit_feed`` row.

    Raises:
        requests.HTTPError: if the home page does not return a success status.
    """
    resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
    # Do not parse (and then store) an error page as if it were the standings.
    resp.raise_for_status()
    html = resp.text
    print(f"Length: {len(html)}")
    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    print(f"Tables: {len(tables)}")

    # Diagnostic only: show the closest heading before each table.
    title_positions = find_table_titles(html)
    table_positions = [m.start() for m in re.finditer(r'<table[^>]*>', html)]
    table_with_title = []
    for tp in table_positions:
        preceding = [t for pos, t in title_positions if pos < tp]
        table_with_title.append((tp, preceding[-1] if preceding else "Unknown"))
    print("\n=== Table titles ===")
    for i, (tp, t) in enumerate(table_with_title[:8]):
        print(f" Table {i+1}: {t[:80]}")

    # Manual mapping of table index -> league (tables 1-5 of the home page).
    LEAGUES_2025_26 = [
        # Idx, Name, Razina, Spol
        (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
        (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
        (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
        (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
        (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
    ]

    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()
        total_inserted = 0
        pgz_seen = set()
        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # Get/create natjecanje
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
                              VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
                              RETURNING id""", (natj_naziv, razina))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            # Replace this league's rows; committed together below so a failure
            # cannot leave the league empty or half-filled.
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'", (natj_id,))
            for row in rows:
                klub_id = None
                # Exact or substring name match; prefer active, volleyball, PGŽ clubs.
                cu.execute("""SELECT id, region FROM pgz_sport.klubovi
                              WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
                              ORDER BY
                               CASE WHEN aktivan THEN 0 ELSE 1 END,
                               CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                               CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                               id ASC LIMIT 1""", (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id = kr[0]
                    if kr[1] == 'PGŽ':
                        pgz_seen.add(row['klub'])
                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                              (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                               nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                              VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
                           (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
                            json.dumps({"spol": spol})))
                total_inserted += 1
            conn.commit()

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")
        cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
                      FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
                      WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HOS lige ===")
        for rec in cu.fetchall():
            print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")
        cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hos_cvf' AND k.region='PGŽ'
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HOS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")
        # sorted() keeps the audit payload deterministic between runs.
        cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                      VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
                   (json.dumps({"inserted": total_inserted, "pgz_seen": sorted(pgz_seen)}),))
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


if __name__ == "__main__":
    main()
+178
View File
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""HVS Riznica - parse direct HTML, no Playwright. Extract champions per season."""
import re
import requests
import psycopg2

# NOTE(review): credentials are hard-coded in source — consider loading them from the environment.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')

# (label stored as `natjecanje`, URL slug under https://hvs.hr/riznica/)
CATEGORIES = [
    ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"),
    ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"),
    ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"),
    ("Kup Hrvatske - žene", "kup-hrvatske-zene"),
    ("Trofej Toni Nardelli", "trofej-toni-nardelli"),
    ("Vaterpolist godine", "vaterpolist"),
    ("Vaterpolistica godine", "vaterpolistica"),
]
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

# Each slide block is expected to carry one winner name and one year.
SLIDE_SPLIT_RE = re.compile(r'<div class="riznica__slide(?:\s+is-visible)?">')
# Tried in order: competition winner first, then medal name (individual awards).
NAME_RES = (
    re.compile(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<'),
    re.compile(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<'),
)
YEAR_RES = (
    re.compile(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)'),
    re.compile(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<'),
)
IMG_RE = re.compile(r'<img src="([^"]+)"')


def first_match(patterns, text):
    """Return the first successful ``search`` among ``patterns`` on ``text``, else None."""
    for pat in patterns:
        m = pat.search(text)
        if m:
            return m
    return None


def parse_slides(html):
    """Split ``html`` on slide divs and return ``{"name", "year", "img"}`` records.

    Slides lacking either a name or a year are dropped; ``img`` is None when
    the slide has no ``<img src="...">``.
    """
    records = []
    for blk in SLIDE_SPLIT_RE.split(html)[1:]:
        name_m = first_match(NAME_RES, blk)
        year_m = first_match(YEAR_RES, blk)
        if not (name_m and year_m):
            continue
        img_m = IMG_RE.search(blk)
        records.append({
            "name": name_m.group(1).strip(),
            "year": year_m.group(1).strip(),
            "img": img_m.group(1) if img_m else None,
        })
    return records


def print_page_diagnostics(html):
    """Print page-wide counts of slides, names and years (debug output only)."""
    slides = re.findall(
        r'<div class="riznica__slide[^"]*"[^>]*>(.+?)(?=<div class="riznica__slide|<section|<footer)',
        html, re.DOTALL)
    print(f" Slides: {len(slides)}", flush=True)
    comp_names = re.findall(r'<h2 class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<', html)
    medal_names = re.findall(r'<h2 class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<', html)
    years = re.findall(r'<h2 class="riznica__competitions__current"[^>]*>\s*([^<]+?)\s*<', html)
    years_alt = re.findall(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<', html)
    # Single-year format (Trofej, Vaterpolist often have just a single year)
    years_single = re.findall(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\.)\s*<', html)
    all_years_sorted = sorted(y.strip() for y in set(years + years_alt + years_single) if y.strip())
    print(f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(all_years_sorted)}", flush=True)
    if comp_names:
        print(f" Sample champ: {comp_names[:3]}")
    if medal_names:
        print(f" Sample medal: {medal_names[:3]}")
    if all_years_sorted:
        print(f" Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}")


def find_klub_id(cu, champ):
    """Return the id of the best-matching club for ``champ`` (active, water polo first) or None."""
    cu.execute("""SELECT id FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) LIKE LOWER(%s) OR LOWER(naziv) = LOWER(%s)
                  ORDER BY
                   CASE WHEN aktivan THEN 0 ELSE 1 END,
                   CASE WHEN sport='vaterpolo' THEN 0 ELSE 1 END,
                   id ASC LIMIT 1""",
               (f'%{champ}%', champ))
    kid_row = cu.fetchone()
    return kid_row[0] if kid_row else None


conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
inserted_total = 0
all_winners = []
try:
    for label, slug in CATEGORIES:
        url = f"https://hvs.hr/riznica/{slug}/"
        print(f"\n=== {label} ({slug}) ===", flush=True)
        # One failing category must not stop the remaining ones.
        try:
            r = requests.get(url, headers=HDR, timeout=15)
            if r.status_code != 200:
                print(f" HTTP {r.status_code}")
                continue
            html = r.text
            print_page_diagnostics(html)

            all_records = parse_slides(html)
            print(f" Parsed records: {len(all_records)}", flush=True)
            for rec in all_records[:3]:
                print(f" {rec['year']}: {rec['name']}")

            # Insert into DB
            for rec in all_records:
                year = rec['year']
                champ = rec['name']
                klub_id = find_klub_id(cu, champ)
                try:
                    cu.execute("""INSERT INTO pgz_sport.klub_sezona
                                  (klub_id, klub_naziv, sezona, natjecanje, plasiranje, trofej, source, source_url)
                                  VALUES (%s, %s, %s, %s, 1, %s, 'hvs_riznica', %s)
                                  ON CONFLICT DO NOTHING""",
                               (klub_id, champ, year, label,
                                f'1. mjesto - {label} {year}', url))
                    if cu.rowcount > 0:
                        inserted_total += 1
                except Exception as e:
                    # Keep going, but report the failed row instead of hiding it.
                    print(f" insert failed for {champ} ({year}): {e}", flush=True)
            all_winners.append({"category": label, "count": len(all_records), "records": all_records})
        except Exception as e:
            print(f" EXC: {e}")

    print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===")
    # Audit log
    cu.execute("""INSERT INTO pgz_sport.audit_feed
                  (table_name, action, source, source_url, details)
                  VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""",
               (f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}',))
    # Top winners
    cu.execute("""SELECT klub_naziv, count(*) AS naslova
                  FROM pgz_sport.klub_sezona
                  WHERE source='hvs_riznica'
                  GROUP BY klub_naziv
                  ORDER BY count(*) DESC LIMIT 15""")
    print("\n=== TOP HVS prvaci/medalisti ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3}× {r[0]}")
    # PGŽ-relevant
    # NOTE(review): region filter added so the list matches its "PGŽ klubovi" heading — confirm klubovi.region is populated for these clubs.
    cu.execute("""SELECT k.naziv, count(ks.*) AS naslova
                  FROM pgz_sport.klub_sezona ks
                  JOIN pgz_sport.klubovi k ON k.id = ks.klub_id
                  WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL AND k.region='PGŽ'
                  GROUP BY k.naziv
                  ORDER BY count(*) DESC""")
    print("\n=== PGŽ klubovi sa HVS naslovima ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3}× {r[0]}")
finally:
    conn.close()
+117
View File
@@ -0,0 +1,117 @@
#!/usr/bin/env python3
"""Load 2025 per-sport totals and the categorised-athletes list into pgz_sport.

Reads two JSON files from /opt/pgz-sport/data/, updates
``statistika_saveza`` for 2025 and sets the ``hoo_kategorija*`` columns on
``clanovi`` (inserting athletes that are not there yet).
"""
import psycopg2
import json
import re

# NOTE(review): credentials are hard-coded in source — consider loading them from the environment.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')

# sport name (lower case) -> fragment looked for in the federation (savez) name
SPORT_TO_SAVEZ_KEYS = {
    'atletika':'atletski','biciklizam':'biciklist','boćanje':'boćar','boks':'boksač',
    'gimnastika':'gimnast','jedriličarstvo':'jedrili','judo':'judo','karate':'karate',
    'kendo':'kendo','kickboxing':'kickbox','košarka':'košark','kuglanje':'kuglač',
    'nogomet':'nogomet','odbojka':'odbojkaš','parasport':'parasport','ples':'ples',
    'plivanje':'plivačk','ronilaštvo':'ronila','rukomet':'rukometn',
    'sinkronizirano plivanje':'plivačk','skijanje':'skijaš','sport gluhih':'gluh',
    'sportski ribolov':'ribolov','sportsko penjanje':'penjač','stolni tenis':'stolnotenis',
    'streličarstvo':'streličar','streljaštvo':'streljač','tenis':'teniski',
    'triatlon':'triatlon','vaterpolo':'vaterpol','veslanje':'veslač',
}


def load_json(path):
    """Read a UTF-8 JSON file and close the handle afterwards."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


def parse_d(s):
    """Convert 'D.M.YYYY' (a trailing dot is allowed) to 'YYYY-MM-DD'; None if unparsable."""
    try:
        d, m, y = s.strip().rstrip('.').split('.')
        return f"{int(y):04d}-{int(m):02d}-{int(d):02d}"
    except (AttributeError, ValueError):
        return None


conn = psycopg2.connect(**DB); conn.autocommit = True
try:
    cu = conn.cursor()
    totals = load_json('/opt/pgz-sport/data/sport_totals_2025.json')
    print(f"Sport totals 2025: {len(totals)}")

    cu.execute("SELECT id, naziv FROM pgz_sport.savezi WHERE naziv ILIKE '%PGŽ%' OR naziv ILIKE '%Primorsko%'")
    all_savezi = cu.fetchall()
    print(f"PGŽ savezi: {len(all_savezi)}")
    # For every sport keep the first federation whose name contains the fragment.
    savez_map = {}
    for sport_lc, key in SPORT_TO_SAVEZ_KEYS.items():
        for sid, naziv in all_savezi:
            if key.lower() in naziv.lower():
                savez_map[sport_lc] = sid
                break

    # Update statistika_saveza for 2025.
    # Several sports can map to one federation (e.g. 'plivanje' and
    # 'sinkronizirano plivanje'), so totals are summed per federation first;
    # writing them one by one let the last sport overwrite the others.
    totals_by_savez = {}
    for sport_name, data in totals.items():
        # presumably the JSON keys are already lower-case sport names; normalised to be safe
        sid = savez_map.get(sport_name.strip().lower())
        if not sid:
            continue
        totals_by_savez[sid] = totals_by_savez.get(sid, 0) + (data['total'] or 0)
    n_upd = 0
    for sid, registriranih in totals_by_savez.items():
        cu.execute("SELECT id FROM pgz_sport.statistika_saveza WHERE savez_id=%s AND godina=%s", (sid, 2025))
        e = cu.fetchone()
        if e:
            cu.execute("UPDATE pgz_sport.statistika_saveza SET registriranih=%s WHERE id=%s", (registriranih, e[0]))
        else:
            cu.execute("INSERT INTO pgz_sport.statistika_saveza (savez_id, godina, registriranih) VALUES (%s,%s,%s)",
                       (sid, 2025, registriranih))
        n_upd += 1
    print(f"Statistika_saveza 2025 updated: {n_upd}")

    # Categorised athletes
    KAT = load_json('/opt/pgz-sport/data/kategorizirani_2025.json')
    print(f"\nKategorizirani: {len(KAT)}")
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija TEXT")
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija_od DATE")
    cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija_do DATE")
    n_matched = 0; n_inserted = 0; n_dup = 0
    seen = set()
    for k in KAT:
        ime = k['ime'].strip()
        prezime = k['prezime'].strip()
        sport = k['sport'].strip().lower()
        klub = k['klub'].strip().strip('"').strip()
        mjesto = k['mjesto'].strip()
        hoo_kat = k['hoo_kategorija']
        od = parse_d(k['vrijedi_od'])
        do = parse_d(k['vrijedi_do'])
        # Only the first entry per (name, surname, sport) is processed.
        key = (ime.lower(), prezime.lower(), sport)
        if key in seen:
            n_dup += 1
            continue
        seen.add(key)
        cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s)
                      AND (sport IS NULL OR LOWER(sport)=LOWER(%s)) LIMIT 1""", (ime, prezime, sport))
        row = cu.fetchone()
        if row:
            cu.execute("""UPDATE pgz_sport.clanovi SET hoo_kategorija=%s, hoo_kategorija_od=%s,
                          hoo_kategorija_do=%s, sport=COALESCE(sport, %s) WHERE id=%s""",
                       (hoo_kat, od, do, sport, row[0]))
            n_matched += 1
        else:
            klub_id = None
            if klub:
                # Skip the lookup for an empty club name: LIKE '%%' matches any
                # club and would attach an arbitrary one.
                cu.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) LIKE LOWER(%s) AND region IS NOT NULL LIMIT 1",
                           (f"%{klub[:30]}%",))
                kr = cu.fetchone()
                if kr:
                    klub_id = kr[0]
            cu.execute("""INSERT INTO pgz_sport.clanovi (ime, prezime, sport, mjesto_rodenja, klub_id,
                          hoo_kategorija, hoo_kategorija_od, hoo_kategorija_do)
                          VALUES (%s,%s,%s,%s,%s,%s,%s,%s)""",
                       (ime, prezime, sport, mjesto, klub_id, hoo_kat, od, do))
            n_inserted += 1
    print(f"Matched: {n_matched}, Inserted: {n_inserted}, Dup: {n_dup}")

    cu.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE hoo_kategorija IS NOT NULL")
    print(f"\nSportaša s HOO kategorijom: {cu.fetchone()[0]}")
    cu.execute("SELECT hoo_kategorija, count(*) FROM pgz_sport.clanovi WHERE hoo_kategorija IS NOT NULL GROUP BY hoo_kategorija ORDER BY hoo_kategorija")
    for r in cu.fetchall():
        print(f" HOO {r[0]}: {r[1]}")
    cu.execute("SELECT count(*) FROM pgz_sport.clanovi")
    print(f"\nUkupno sportaša: {cu.fetchone()[0]}")
    # Sport totals 2025 result
    cu.execute("""SELECT s.naziv, ss.registriranih FROM pgz_sport.statistika_saveza ss
                  JOIN pgz_sport.savezi s ON s.id=ss.savez_id
                  WHERE ss.godina=2025 AND ss.registriranih > 0 ORDER BY ss.registriranih DESC LIMIT 12""")
    print("\n2025 statistika_saveza top 12:")
    for r in cu.fetchall():
        print(f" {r[0]:50} {r[1]:>6}")
finally:
    conn.close()
+114
View File
@@ -0,0 +1,114 @@
import psycopg2
# Keyword arguments handed to psycopg2.connect(**DB) in main().
# NOTE(review): the database password is hard-coded in a committed file —
# consider loading it from the environment or a secrets store and rotating it.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
DOCS = [
('Zakon o porezu na dobit','Porezni tretman sportskih udruga i klubova.','zakon','RH','Sabor RH',None,'NN 177/04+','https://www.zakon.hr/z/99/Zakon-o-porezu-na-dobit',['porez','udruga'],'2004-12-23'),
('Zakon o PDV-u — sport','PDV tretman ulaznica i sponzorstava.','zakon','RH','Sabor RH',None,'NN 73/13','https://www.zakon.hr/z/186/Zakon-o-porezu-na-dodanu-vrijednost',['PDV','ulaznice'],'2013-06-19'),
('Zakon o radu — sportaš/trener','Radni odnosi profesionalnih sportaša i trenera.','zakon','RH','Sabor RH',None,'NN 93/14, 151/22, 64/23','https://www.zakon.hr/z/307/Zakon-o-radu',['radni odnos','trener'],'2014-07-25'),
('Zakon o pravu na pristup informacijama','Obveze JLS za objavu o financiranju sporta.','zakon','RH','Sabor RH',None,'NN 25/13, 85/15, 69/22','https://www.zakon.hr/z/126/Zakon-o-pravu-na-pristup-informacijama',['transparentnost'],'2013-02-15'),
('Zakon o volonterstvu','Volonterski rad u sportskim klubovima.','zakon','RH','Sabor RH',None,'NN 58/07, 22/13, 84/21','https://www.zakon.hr/z/220/Zakon-o-volonterstvu',['volonter'],'2007-06-08'),
('Pravilnik o registru sportskih udruga MTS','MTS javni registar.','pravilnik','RH','MTS',None,'NN 31/24','https://mtus.gov.hr/sport-2625/registri-sportskih-udruga/22516',['registar','MTS'],'2024-03-15'),
('Pravilnik o stručnim poslovima u sportu','Kvalifikacije i licence stručnog kadra.','pravilnik','RH','MTS',None,'NN 89/23','https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html',['licenca','trener'],'2023-08-04'),
('Pravilnik o statusu sportaša s posebnim statusom','Vrhunski sportaši — stipendije, doprinosi.','pravilnik','RH','MTS',None,'NN 14/23','https://mtus.gov.hr/sport/2625',['vrhunski','staž'],'2023-02-01'),
('Pravilnik o korištenju olimpijske oznake','HOO pravila o olimpijskim simbolima.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['olimpijski'],'2020-01-01'),
('Pravilnik o članicama HOO-a','Kriteriji za nacionalne saveze.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/clanstvo',['HOO','clanstvo'],'2022-01-01'),
('Etički kodeks HOO-a','Etička načela u sportu.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['etika'],'2021-01-01'),
('Pravilnik HOO o sportskim stipendijama','HOO program stipendiranja.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/sport-u-hrvatskoj/stipendije',['stipendija'],'2023-01-01'),
('Pravilnik o nagradama HOO','HOO godišnje nagrade.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['nagrade'],'2022-01-01'),
('Statut HNS-a','Hrvatski nogometni savez statut.','statut','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS'],'2023-01-01'),
('Statut HRS-a','Hrvatski rukometni savez statut.','statut','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS'],'2023-01-01'),
('Statut HKS-a','Hrvatski košarkaški savez statut.','statut','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS'],'2023-01-01'),
('Statut HBS Boćarski','Hrvatski boćarski savez.','statut','Savez','HBS','boćanje',None,'https://hrvatski-bocarski-savez.hr/savez/dokumenti/',['HBS','boćanje'],'2022-01-01'),
('Statut HOS Odbojkaški','Hrvatski odbojkaški savez.','statut','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS'],'2022-01-01'),
('Statut HŠS Šahovski','Hrvatski šahovski savez.','statut','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','FIDE'],'2022-01-01'),
('Statut HVS Veslački','Hrvatski veslački savez.','statut','Savez','HVS','veslanje',None,'https://www.veslanje.hr/dokumenti',['HVS'],'2021-01-01'),
('Statut HJS Jedriličarski','Hrvatski jedriličarski savez.','statut','Savez','HJS','jedriličarstvo',None,'https://www.hjs.hr/dokumenti',['HJS'],'2022-01-01'),
('Statut HAS Atletski','Hrvatski atletski savez.','statut','Savez','HAS','atletika',None,'https://www.has.hr/index.php/dokumenti',['HAS','World Athletics'],'2023-01-01'),
('Statut HVPS Vaterpolski','Hrvatski vaterpolski savez.','statut','Savez','HVPS','vaterpolo',None,'https://hvs.hr/dokumenti',['HVPS'],'2022-01-01'),
('Statut HPS Plivacki','Hrvatski plivacki savez.','statut','Savez','HPS','plivanje',None,'https://hps.com.hr/dokumenti',['HPS','FINA'],'2022-01-01'),
('Statut HTS Teniski','Hrvatski teniski savez.','statut','Savez','HTS','tenis',None,'https://hts.hr/dokumenti/',['HTS','ITF'],'2023-01-01'),
('Statut HSTS Stolnoteniski','Hrvatski stolnoteniski savez.','statut','Savez','HSTS','stolni tenis',None,'https://hsts.hr/dokumenti/',['HSTS','ITTF'],'2022-01-01'),
('Statut HBS-UCI Biciklistički','Hrvatski biciklistički savez.','statut','Savez','HBS-UCI','biciklizam',None,'https://www.hbs.hr/dokumenti',['biciklizam','UCI'],'2022-01-01'),
('Statut HKZ Karate','Hrvatski karate savez.','statut','Savez','HKZ','karate',None,'https://www.hkz.hr/dokumenti',['karate','WKF'],'2022-01-01'),
('Statut HJSav Judo','Hrvatski judo savez.','statut','Savez','HJSav','judo',None,'https://www.judosavez.hr/dokumenti',['judo','IJF'],'2022-01-01'),
('Statut HTKDS Taekwondo','Hrvatski taekwondo savez.','statut','Savez','HTKDS','taekwondo',None,'https://www.taekwondo.hr/dokumenti/',['taekwondo','WT'],'2022-01-01'),
('Statut PGZ','Temeljni akt PGŽ-a.','statut','PGZ','PGŽ',None,None,'https://www.pgz.hr/sluzbene-novine',['PGŽ','statut'],'2021-04-01'),
('Plan razvoja PGŽ 2021-2027','Plan razvoja županije.','strategija','PGZ','PGŽ',None,None,'https://www.pgz.hr/strategije',['plan','EU fondovi'],'2021-12-01'),
('Pravilnik o sufinanciranju opreme PGŽ','Sredstva za opremu klubova.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['oprema'],'2023-01-01'),
('Pravilnik o sufinanciranju građevina PGŽ','Investicijska sredstva za građevine.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['građevina'],'2023-01-01'),
('Odluka o nagradama u sportu PGŽ','Sustav nagrada.','odluka','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['nagrada'],'2023-01-01'),
('Strategija sporta Grada Rijeke 2020-2030','Gradski plan razvoja sporta.','strategija','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/strategija-sporta',['Rijeka'],'2020-12-01'),
('Pravilnik javne potrebe sport Rijeka','Postupak dodjele sredstava.','pravilnik','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/pravilnici',['Rijeka','financiranje'],'2023-01-01'),
('Strategija sporta Grada Opatije','Gradska strategija.','strategija','Grad Rijeka','Grad Opatija',None,None,'https://www.opatija.hr/sport',['Opatija'],'2021-01-01'),
('Program sport Crikvenica','Crikvenički program.','program','Grad Rijeka','Grad Crikvenica',None,None,'https://www.crikvenica.hr/programi',['Crikvenica'],'2024-01-01'),
('Program sport Krk','Krčki program.','program','Grad Rijeka','Grad Krk',None,None,'https://www.grad-krk.hr/programi',['Krk'],'2024-01-01'),
('Program sport Mali Lošinj','Lošinjski program.','program','Grad Rijeka','Grad Mali Lošinj',None,None,'https://www.mali-losinj.hr/programi',['Lošinj'],'2024-01-01'),
('Program sport Delnice','Delnički program.','program','Grad Rijeka','Grad Delnice',None,None,'https://www.delnice.hr/programi',['Delnice'],'2024-01-01'),
('Program sport Vrbovsko','Vrbovski program.','program','Grad Rijeka','Grad Vrbovsko',None,None,'https://www.vrbovsko.hr/programi',['Vrbovsko'],'2024-01-01'),
('Program sport Novi Vinodolski','Novovinodolski program.','program','Grad Rijeka','Grad Novi Vinodolski',None,None,'https://www.novi-vinodolski.hr/programi',['Novi Vinodolski'],'2024-01-01'),
('Program sport Kraljevica','Kraljevički program.','program','Grad Rijeka','Grad Kraljevica',None,None,'https://www.kraljevica.hr/programi',['Kraljevica'],'2024-01-01'),
('Program sport Bakar','Bakarski program.','program','Grad Rijeka','Grad Bakar',None,None,'https://www.bakar.hr/programi',['Bakar'],'2024-01-01'),
('Program sport Cres','Creski program.','program','Grad Rijeka','Grad Cres',None,None,'https://www.cres.hr/programi',['Cres'],'2024-01-01'),
('Program sport Rab','Rabski program.','program','Grad Rijeka','Grad Rab',None,None,'https://www.rab.hr/programi',['Rab'],'2024-01-01'),
('Program sport Kastav','Kastavski program.','program','Grad Rijeka','Grad Kastav',None,None,'https://www.kastav.hr/programi',['Kastav'],'2024-01-01'),
('WADA Lista zabranjenih tvari','WADA Prohibited List godišnje.','pravilnik','EU','WADA',None,None,'https://www.wada-ama.org/en/prohibited-list',['WADA'],'2024-01-01'),
('Pravilnik HASMS o TUE','TUE postupak u Hrvatskoj.','pravilnik','RH','HASMS',None,None,'https://hasms.hr/anti-doping/dokumenti/',['TUE','HASMS'],'2023-01-01'),
('European Sports Charter','CoE povelja o sportu.','pravilnik','EU','Council of Europe',None,None,'https://rm.coe.int/european-sports-charter-1992',['CoE'],'2021-10-13'),
('Macolin Convention','CoE protiv namještanja.','pravilnik','EU','Council of Europe',None,None,'https://www.coe.int/en/web/sport/macolin-convention',['namještanje'],'2014-09-18'),
('UEFA Financial Sustainability','Financijski Fair Play.','pravilnik','EU','UEFA','nogomet',None,'https://documents.uefa.com/v/u/MFFvQjlRCJF7RJYIoyzMRA',['UEFA','FFP'],'2022-06-01'),
('Pravilnik o radu školskih sportskih društava','ŠŠD organizacija.','pravilnik','RH','MZO + MTS',None,None,'https://mzo.gov.hr/sport',['ŠŠD'],'2022-01-01'),
('Pravilnik za status parasportaša','Klasifikacija parasportaša.','pravilnik','RH','HPO',None,None,'https://www.hpo.hr/Dokumenti',['parasport','HPO'],'2022-01-01'),
('IPC klasifikacija parasportaša','IPC klasifikacije po sportu.','pravilnik','EU','IPC',None,None,'https://www.paralympic.org/classification',['IPC','parasport'],'2023-01-01'),
('Zakon o lovstvu','Temeljni zakon o lovstvu.','zakon','RH','Sabor RH','lov','NN 99/18, 32/19, 153/22','https://www.zakon.hr/z/127/Zakon-o-lovstvu',['lov','divljač'],'2018-11-09'),
('Pravilnik o lovostaju','Lovne sezone.','pravilnik','RH','Min. poljoprivrede','lov',None,'https://mps.gov.hr/lovstvo/dokumenti',['lovostaj'],'2022-01-01'),
('Pravilnik o lovniku','Pravila lova, oprema.','pravilnik','RH','Min. poljoprivrede','lov',None,'https://mps.gov.hr/lovstvo/dokumenti',['lovnik'],'2022-01-01'),
('Pravila igre Nogomet IFAB','IFAB Laws of the Game.','pravilnik','EU','IFAB/FIFA','nogomet',None,'https://www.theifab.com/laws/',['IFAB','FIFA'],'2024-07-01'),
('Pravila igre Rukomet IHF','IHF Rules.','pravilnik','EU','IHF','rukomet',None,'https://www.ihf.info/regulations-documents',['IHF'],'2022-07-01'),
('Pravila igre Košarka FIBA','FIBA Official Rules.','pravilnik','EU','FIBA','košarka',None,'https://www.fiba.basketball/basketballrules',['FIBA'],'2024-10-01'),
('Propisi natjecanja HBS','HBS volo/raffa/petanque.','pravilnik_savez','Savez','HBS','boćanje',None,'https://hrvatski-bocarski-savez.hr/savez/dokumenti/',['HBS','natjecanje'],'2024-01-01'),
('Propisi natjecanja HVS','HVS regate, kategorije.','pravilnik_savez','Savez','HVS','veslanje',None,'https://www.veslanje.hr/dokumenti',['HVS'],'2024-01-01'),
('Propisi natjecanja HJS','HJS regate, RRS.','pravilnik_savez','Savez','HJS','jedriličarstvo',None,'https://www.hjs.hr/dokumenti',['HJS'],'2024-01-01'),
('Propisi natjecanja HŠS','HŠS FIDE pravila.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','FIDE'],'2024-01-01'),
]
def _print_group_counts(cu, heading, width):
    """Print *heading*, then one ``label count`` line per row pending on *cu*.

    Expects the cursor to hold the result of a two-column
    ``SELECT <key>, count(*) ... GROUP BY <key>`` query.  A NULL key is
    returned as None, and ``f'{None:<15}'`` raises TypeError, so NULL is
    shown as '-' instead of aborting the report.
    """
    print(heading)
    for label, count in cu.fetchall():
        shown = '-' if label is None else label
        print(f' {shown:<{width}} {count}')


def main():
    """Insert DOCS entries missing from pgz_sport.dokumenti and print a report.

    Each DOCS entry is a 10-tuple in INSERT column order (title, kratak_opis,
    vrsta, razina, organizacija, sport, sluzbeni_glasnik, izvor_url,
    kljucne_rijeci, izdano_datum); ``aktivan`` is always written as true.
    Titles are compared case-insensitively against the rows already in the
    table, so re-running the script skips what is already loaded.

    The connection runs in autocommit mode: every INSERT is committed on its
    own, and a failing row is reported and counted without stopping the run.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cu = conn.cursor()

        cu.execute("SELECT LOWER(COALESCE(title, '')) FROM pgz_sport.dokumenti")
        existing = {r[0] for r in cu.fetchall()}

        n_added = 0
        n_skipped = 0
        n_err = 0
        for d in DOCS:
            title_key = d[0].lower()
            if title_key in existing:
                n_skipped += 1
                continue
            try:
                cu.execute('''INSERT INTO pgz_sport.dokumenti
                    (title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik,
                     izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)''', d)
                n_added += 1
                # Also guards against duplicate titles inside DOCS itself.
                existing.add(title_key)
            except Exception as e:
                # Best-effort load: report the row and keep going.
                n_err += 1
                print(f' err {d[0][:40]}: {e}')
        print(f'Added: {n_added}, Skipped: {n_skipped}, Errors: {n_err}')

        cu.execute('SELECT count(*) FROM pgz_sport.dokumenti')
        print(f'TOTAL: {cu.fetchone()[0]}')

        cu.execute('SELECT razina, count(*) FROM pgz_sport.dokumenti GROUP BY razina ORDER BY count(*) DESC')
        _print_group_counts(cu, '\nPo razini:', 15)

        # The INSERT above never sets `tip`, so unless the column has a
        # default this grouping can contain a NULL key.
        cu.execute('SELECT tip, count(*) FROM pgz_sport.dokumenti GROUP BY tip ORDER BY count(*) DESC')
        _print_group_counts(cu, '\nPo tipu:', 25)
    finally:
        # Release the connection even when a query above raises.
        conn.close()


if __name__ == '__main__':
    main()
+151
View File
@@ -0,0 +1,151 @@
#!/usr/bin/env python3
import psycopg2
# Keyword arguments handed to psycopg2.connect(**DB) in main().
# NOTE(review): the database password is hard-coded in a committed file —
# consider loading it from the environment or a secrets store and rotating it.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
DOCS = [
# ═══ PGŽ ZAJEDNICA SPORTOVA godišnje ═══
('Detaljna raspodjela sredstava JPS PGŽ 2025','Raspodjela sredstava javnih potreba u sportu PGŽ za 2025. godinu po klubovima i savezima.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2025',['raspodjela','PGŽ','klub','savez','2025'],'2025-01-01'),
('Detaljna raspodjela sredstava JPS PGŽ 2024','Raspodjela sredstava javnih potreba u sportu PGŽ za 2024. godinu.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2024',['raspodjela','PGŽ','2024'],'2024-01-01'),
('Detaljna raspodjela sredstava JPS PGŽ 2023','Raspodjela sredstava 2023.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2023',['raspodjela','PGŽ','2023'],'2023-01-01'),
('Detaljna raspodjela sredstava JPS PGŽ 2022','Raspodjela sredstava 2022.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2022',['raspodjela','PGŽ','2022'],'2022-01-01'),
('Financijski plan ZS PGŽ 2025','Godišnji financijski plan Zajednice sportova PGŽ za 2025.','plan','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/plan-2025',['plan','financije','ZS PGŽ','2025'],'2025-01-01'),
('Financijski izvještaj ZS PGŽ 2024','Godišnji izvještaj Zajednice sportova PGŽ za 2024.','izvjestaj','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/izvjestaj-2024',['izvještaj','PGŽ','2024'],'2025-03-01'),
('Financijski izvještaj ZS PGŽ 2023','Izvještaj 2023.','izvjestaj','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/izvjestaj-2023',['izvještaj','PGŽ','2023'],'2024-03-01'),
('Statut Zajednice sportova PGŽ','Temeljni akt ZS PGŽ — članstvo, organi, financiranje, ovlasti.','statut','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/statut',['ZS PGŽ','statut'],'2022-01-01'),
('Pravilnik o radu ZS PGŽ','Unutarnji pravilnik rada — donošenje odluka, sjednice, glasanje.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/pravilnik-rada',['ZS PGŽ','rad'],'2022-01-01'),
('Poslovnik Skupštine ZS PGŽ','Pravila rada Skupštine — kvorum, glasanje, dnevni red.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/poslovnik',['ZS PGŽ','poslovnik','skupština'],'2022-01-01'),
# ═══ KRITERIJI VREDNOVANJA ═══
('Kriteriji za vrednovanje sportske kvalitete u PGŽ','Bodovni sustav za rangiranje klubova/saveza po sportskim rezultatima.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/kriteriji',['kriteriji','vrednovanje','rang'],'2023-01-01'),
('Kriteriji za nositelje kvalitete u sportu','Definicija statusa nositelja kvalitete — viši stupanj sufinanciranja.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/nositelji',['nositelj kvalitete','sufinanciranje'],'2023-01-01'),
# ═══ HOO DODATNO ═══
('Pravilnik o registraciji nacionalnih saveza HOO','Postupak učlanjenja nacionalnog saveza u HOO.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/registracija',['HOO','registracija','nacionalni savez'],'2022-01-01'),
('Pravilnik o discipliniranju u HOO sustavu','Disciplinske mjere unutar HOO-a i nacionalnih saveza.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/disciplina',['HOO','disciplina'],'2022-01-01'),
('Pravilnik o medijima i komunikacijama HOO','Pravila objave informacija, akreditacije za medije.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/mediji',['HOO','mediji','akreditacija'],'2021-01-01'),
('Pravilnik o sportskoj znanosti i istraživanjima HOO','Suradnja sa sveučilištima i znanstvenim institucijama.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/znanost',['HOO','znanost','istraživanje'],'2021-01-01'),
# ═══ MTS dodatno ═══
('Pravilnik o licenciranju trenera u sportu','Sustav licenciranja stručnog kadra — kategorije A/B/C, kontinuirano obrazovanje.','pravilnik','RH','MTS',None,'NN 89/23 i izmjene','https://mtus.gov.hr/dokumenti/sport',['licenca','trener','MTS'],'2023-08-04'),
('Pravilnik o sigurnosti na sportskim događanjima','Sigurnosne mjere, redari, zaštitari, video nadzor.','pravilnik','RH','MUP + MTS',None,'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19','https://mup.gov.hr/sigurnost-na-sportu',['sigurnost','redari','navijači','MUP'],'2003-07-15'),
('Zakon o sprječavanju nereda na sportskim natjecanjima','Pravna osnova za sigurnosne mjere na utakmicama.','zakon','RH','Sabor RH',None,'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19','https://www.zakon.hr/z/345/Zakon-o-spre%C4%8Davanju-nereda-na-sportskim-natjecanjima',['neredi','navijači','sigurnost'],'2003-07-15'),
('Zakon o priznavanju i vrednovanju inozemnih obrazovnih kvalifikacija — sport','Priznavanje stranih trenerskih licenci.','zakon','RH','MZO',None,'NN 69/22','https://www.zakon.hr/z/2856/Zakon-o-priznavanju-i-vrednovanju-inozemnih-obrazovnih-kvalifikacija',['priznavanje','strana licenca'],'2022-06-15'),
('Pravilnik o uvjetima za obavljanje djelatnosti sportskog turizma','Sportski kampovi, škole, treninzi za goste.','pravilnik','RH','MTS',None,None,'https://mtus.gov.hr/sportski-turizam',['sportski turizam','kamp'],'2023-01-01'),
# ═══ HNS DODATNO ═══
('Pravilnik o nogometnim sucima HNS','Licenciranje sudaca, dobne kategorije sudaca, ocjenjivanje.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','suci','licenca'],'2024-01-01'),
('Pravilnik o klupskoj licenciranju HNS','UEFA i HNS klupska licenca — financijski, infrastrukturni, sportski kriteriji.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','licenca','UEFA','klub'],'2024-01-01'),
('Pravilnik o transferu i obeštećenju HNS','Transferi između HNS klubova, obeštećenje za razvoj mladih.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','transfer','obeštećenje','razvoj'],'2024-01-01'),
('Pravilnik o trenerima HNS','Licenciranje nogometnih trenera — UEFA Pro/A/B/C kategorije.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','trener','UEFA Pro','UEFA A'],'2024-01-01'),
('Pravilnik o ženskom nogometu HNS','Posebni propisi za žensku nogometnu hijerarhiju.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','ženski nogomet'],'2023-01-01'),
('Pravilnik o malom nogometu HNS','Futsal natjecanja i pravila u HNS sustavu.','pravilnik_savez','Savez','HNS','futsal',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','futsal','mali nogomet'],'2023-01-01'),
# ═══ HRS DODATNO ═══
('Pravilnik o klupskom licenciranju HRS','Klupska licenca u rukometu — sportski, financijski, infrastrukturni kriteriji.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','licenca','klub'],'2024-01-01'),
('Pravilnik o sucima HRS','Licenciranje rukometnih sudaca.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','suci','licenca'],'2024-01-01'),
('Pravilnik o trenerima HRS','Licenciranje rukometnih trenera.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','trener','licenca'],'2024-01-01'),
# ═══ HKS DODATNO ═══
('Pravilnik o sucima HKS','Košarkaški suci — licenciranje, ocjenjivanje, kategorizacija.','pravilnik_savez','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS','suci','FIBA'],'2024-01-01'),
('Pravilnik o trenerima HKS','Licenciranje košarkaških trenera — FIBA i HKS standardi.','pravilnik_savez','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS','trener','FIBA'],'2024-01-01'),
# ═══ HOS Odbojka dodatno ═══
('Pravilnik o registraciji odbojkaša HOS','Registracija odbojkaša, dobne kategorije, transferi.','pravilnik_savez','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS','registracija'],'2024-01-01'),
('Pravilnik o sucima HOS','Odbojkaški suci.','pravilnik_savez','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS','suci'],'2024-01-01'),
# ═══ HŠS Šah dodatno ═══
('Pravilnik o registraciji igrača HŠS','Registracija šahista, FIDE rating, transferi.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','registracija','FIDE'],'2024-01-01'),
('Pravilnik o sucima HŠS','Licenciranje šahovskih sudaca.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','suci'],'2024-01-01'),
# ═══ EU REGULATIVE ═══
('GDPR (Uredba EU 2016/679)','Opća uredba o zaštiti osobnih podataka — primjenjuje se na sportske klubove (članovi, sportaši).','zakon','EU','EU Komisija',None,'OJ L 119, 4.5.2016','https://gdpr-info.eu/',['GDPR','zaštita podataka','EU','privatnost'],'2016-04-27'),
('Zakon o provedbi Opće uredbe o zaštiti podataka','HR implementacija GDPR.','zakon','RH','Sabor RH',None,'NN 42/18','https://www.zakon.hr/z/941/Zakon-o-provedbi-Op%C4%87e-uredbe-o-za%C5%A1titi-podataka',['GDPR','privatnost','HR'],'2018-04-25'),
('European Charter on Sport for All','CoE povelja o sportu za sve.','pravilnik','EU','Council of Europe',None,None,'https://rm.coe.int/european-charter-on-sport-for-all',['CoE','sport za sve'],'1976-09-24'),
('UNESCO International Charter of Physical Education and Sport','UNESCO međunarodna povelja.','pravilnik','EU','UNESCO',None,None,'https://en.unesco.org/charter-of-physical-education-and-sport',['UNESCO','tjelesni odgoj'],'2015-11-17'),
('Erasmus+ Sport Programme Guide','Vodič za EU programe sufinanciranja sporta — small partnerships, cooperation partnerships.','program','EU','Europska komisija',None,None,'https://erasmus-plus.ec.europa.eu/programme-guide',['Erasmus+','EU','suradnja','grant'],'2024-01-01'),
('Council Conclusions on EU Work Plan for Sport 2024-2027','Plan rada EU za sport.','strategija','EU','Vijeće EU',None,None,'https://www.consilium.europa.eu/sport',['EU','plan rada','2024-2027'],'2023-12-01'),
# ═══ INFRASTRUKTURNI / OBJEKTI ═══
('Pravilnik o sportskim objektima','Tehnički standardi za sportske objekte — dvorane, igrališta, bazeni.','pravilnik','RH','MTS',None,None,'https://mtus.gov.hr/sport-objekti',['objekt','dvorana','bazen','tehnički'],'2023-01-01'),
('Pravilnik o sigurnosti na bazenima','Sigurnosni standardi za bazene i plivačka natjecanja.','pravilnik','RH','MTS + MZ',None,None,'https://mtus.gov.hr/bazeni-sigurnost',['bazen','sigurnost','plivanje'],'2022-01-01'),
# ═══ ŠKOLSKI SPORT DODATNO ═══
('Nacionalni kurikulum tjelesne i zdravstvene kulture','Kurikulum predmeta TZK u OŠ i SŠ.','program','RH','MZO',None,'NN 7/19','https://mzo.gov.hr/UserDocsImages/dokumenti/Obrazovanje/NacionalniKurikulum/predmetni-kurikulumi/Tjelesna%20i%20zdravstvena%20kultura.pdf',['kurikulum','TZK','škola','MZO'],'2019-01-22'),
('Pravilnik o organizaciji i provedbi školskih sportskih natjecanja','ŠŠD natjecanja, županijska, državna razina.','pravilnik','RH','MZO',None,None,'https://mzo.gov.hr/sport/skolska-natjecanja',['ŠŠD','natjecanje','MZO','škola'],'2022-01-01'),
('Sustav školskih sportskih društava (SŠSD)','Organizacija školskih klubova u OŠ i SŠ.','program','RH','MZO + Hrvatski školski sportski savez',None,None,'https://hssd.hr/dokumenti',['ŠŠD','HŠŠS','organizacija'],'2023-01-01'),
# ═══ MEDICINA SPORTA ═══
('Pravilnik o sportskoj medicinskoj zaštiti','Liječnička zaštita sportaša — specijalisti, prevencija ozljeda.','pravilnik','RH','MZ + HASMS',None,None,'https://hasms.hr/dokumenti/',['medicina sporta','liječnik','HASMS','prevencija'],'2023-01-01'),
('Pravilnik o sportskoj prehrani i suplementaciji','Smjernice za prehranu i dopuste u suplementaciji.','pravilnik','RH','HASMS',None,None,'https://hasms.hr/dokumenti/',['prehrana','suplementi','HASMS'],'2022-01-01'),
# ═══ FINANCIJE / RAČUNOVODSTVO ═══
('Pravilnik o neprofitnom računovodstvu','Računovodstvene obveze sportskih udruga.','pravilnik','RH','Min. financija',None,'NN 1/15, 25/17, 96/18, 103/18, 81/19','https://www.zakon.hr/z/810/Pravilnik-o-neprofitnom-ra%C4%8Dunovodstvu-i-ra%C4%8Dunskom-planu',['neprofitno','računovodstvo','udruga'],'2015-01-08'),
('Zakon o financijskom poslovanju i računovodstvu neprofitnih organizacija','Temeljni propis za neprofit.','zakon','RH','Sabor RH',None,'NN 121/14','https://www.zakon.hr/z/672/Zakon-o-financijskom-poslovanju-i-ra%C4%8Dunovodstvu-neprofitnih-organizacija',['neprofit','financije','udruga'],'2014-10-17'),
# ═══ JAVNA NABAVA SPORT ═══
('Zakon o javnoj nabavi','Postupci javne nabave za sportske objekte i opremu.','zakon','RH','Sabor RH',None,'NN 120/16, 114/22','https://www.zakon.hr/z/223/Zakon-o-javnoj-nabavi',['javna nabava','postupak','sport'],'2016-12-29'),
# ═══ SPECIFIČNI SPORTOVI dodatno ═══
('Pravila igre Vaterpolo (FINA Rules)','FINA pravila vaterpola.','pravilnik','EU','World Aquatics',None,None,'https://www.worldaquatics.com/rules/water-polo',['FINA','vaterpolo','pravila'],'2023-01-01'),
('Pravila plivanja FINA','FINA pravila plivanja.','pravilnik','EU','World Aquatics',None,None,'https://www.worldaquatics.com/rules/swimming',['FINA','plivanje'],'2023-01-01'),
('Pravila atletike (World Athletics)','WA tehnička pravila atletike.','pravilnik','EU','World Athletics',None,None,'https://worldathletics.org/about-iaaf/documents/book-of-rules',['World Athletics','atletika'],'2024-01-01'),
('Pravila tenisa (ITF Rules of Tennis)','ITF Rules of Tennis.','pravilnik','EU','ITF','tenis',None,'https://www.itftennis.com/en/about-us/governance/rules-and-regulations/',['ITF','tenis'],'2024-01-01'),
('Pravila stolnog tenisa (ITTF)','ITTF Handbook.','pravilnik','EU','ITTF','stolni tenis',None,'https://www.ittf.com/handbook/',['ITTF','stolni tenis'],'2024-01-01'),
('Pravila biciklizma (UCI)','UCI Cycling Regulations.','pravilnik','EU','UCI','biciklizam',None,'https://www.uci.org/inside-uci/constitutions-regulations/regulations',['UCI','biciklizam'],'2024-01-01'),
('Pravila judoa (IJF)','IJF Rules.','pravilnik','EU','IJF','judo',None,'https://www.ijf.org/ijf/documents',['IJF','judo'],'2023-01-01'),
('Pravila taekwondoa (WT)','World Taekwondo Competition Rules.','pravilnik','EU','World Taekwondo','taekwondo',None,'https://www.worldtaekwondo.org/rules/',['WT','taekwondo'],'2024-01-01'),
('Pravila karatea (WKF)','WKF Kumite/Kata pravila.','pravilnik','EU','WKF','karate',None,'https://www.wkf.net/structure/wkf-rules',['WKF','karate'],'2024-01-01'),
('Pravila boćanja CBI/CMSB','Confederazione Boccistica Internazionale.','pravilnik','EU','CBI','boćanje',None,'https://www.cbi-bocce.com/regolamenti',['CBI','boćanje','volo','raffa'],'2023-01-01'),
('Pravila jedrenja (RRS World Sailing 2025-2028)','Racing Rules of Sailing.','pravilnik','EU','World Sailing','jedriličarstvo',None,'https://www.sailing.org/rrs',['RRS','jedrenje','World Sailing'],'2024-12-01'),
('Pravila veslanja (FISA/World Rowing)','FISA pravila regata.','pravilnik','EU','World Rowing','veslanje',None,'https://worldrowing.com/about/world-rowing/rules-of-racing',['World Rowing','FISA','veslanje'],'2023-01-01'),
('Pravila šaha (FIDE Laws of Chess)','FIDE Handbook.','pravilnik','EU','FIDE','šah',None,'https://handbook.fide.com/',['FIDE','šah'],'2023-07-01'),
# ═══ KLUBSKE LICENCE / TURNIRI ═══
('Pravilnik o organizaciji međunarodnih sportskih natjecanja u RH','Postupak organiziranja međunarodnih turnira u RH (HOO suglasnost, sigurnost).','pravilnik','RH','MTS + HOO',None,None,'https://mtus.gov.hr/medunarodna-natjecanja',['međunarodno','turnir','organizacija'],'2023-01-01'),
# ═══ POSEBNO PGŽ + GRAD RIJEKA NOVIJI ═══
('Proračun PGŽ — sport 2026','Proračunska sredstva za sport u PGŽ proračunu 2026.','plan','PGZ','PGŽ',None,None,'https://www.pgz.hr/proracun-2026',['proračun','PGŽ','2026','sport'],'2025-12-01'),
('Proračun Grada Rijeke — sport 2026','Proračunska sredstva za sport u Rijeci 2026.','plan','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/proracun-2026',['proračun','Rijeka','2026'],'2025-12-01'),
('Plan razvoja Grada Rijeke 2024-2030','Gradski razvojni plan — sport kao strateška mjera.','strategija','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/plan-razvoja',['Rijeka','razvoj','2024-2030'],'2024-01-01'),
('Strategija "Sport za sve" PGŽ','Program rekreativnog sporta i tjelesne aktivnosti građana.','strategija','PGZ','PGŽ',None,None,'https://www.pgz.hr/sport-za-sve',['rekreacija','sport za sve','PGŽ'],'2023-01-01'),
('Pravilnik o radu Sportskih škola PGŽ','Organizacija škola sporta za djecu — sufinanciranje.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/skole-sporta',['škola sporta','djeca','PGŽ'],'2023-01-01'),
('Akcijski plan promidžbe sporta PGŽ','Marketing i promocija sporta u županiji.','program','PGZ','PGŽ',None,None,'https://www.pgz.hr/promidzba-sport',['promidžba','marketing','PGŽ'],'2023-01-01'),
# ═══ ZAŠTITA ═══
('Zakon o zaštiti od nasilja u obitelji — sport','Sportski klubovi kao mjesto prijave nasilja u obitelji.','zakon','RH','Sabor RH',None,'NN 70/17, 126/19, 84/21','https://www.zakon.hr/z/977/Zakon-o-za%C5%A1titi-od-nasilja-u-obitelji',['nasilje','zaštita','obitelj'],'2017-07-21'),
('Sportski etički kodeks RH','Nacionalni etički kodeks — sport bez korupcije, fair play.','pravilnik','RH','HOO + MTS',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/etika',['etika','fair play','korupcija'],'2022-01-01'),
# ═══ EDUKACIJA ═══
('Erasmus+ Sport — Hrvatska iskustva','Sažetak iskusnih projekta iz HR (Erasmus Sport).','erasmus','EU','Europska komisija',None,None,'https://erasmus-plus.ec.europa.eu/sport',['Erasmus+','Hrvatska','EU'],'2024-01-01'),
('Akademski sport u RH (HASS)','Pravilnik Hrvatskog akademskog sportskog saveza.','pravilnik','RH','HASS',None,None,'https://hass.hr/dokumenti',['akademski','HASS','sveučilište'],'2022-01-01'),
('Pravilnik o studentskim sportskim natjecanjima','Univerzitetska natjecanja.','pravilnik','RH','HASS',None,None,'https://hass.hr/natjecanja',['studenti','natjecanje','HASS'],'2022-01-01'),
]
def main():
    """Insert DOCS entries missing from pgz_sport.dokumenti and print totals.

    Each DOCS entry is a 10-tuple in INSERT column order (title, kratak_opis,
    vrsta, razina, organizacija, sport, sluzbeni_glasnik, izvor_url,
    kljucne_rijeci, izdano_datum); ``aktivan`` is always written as true.
    Titles are compared case-insensitively against the rows already in the
    table, so re-running the script skips what is already loaded.

    The connection runs in autocommit mode: every INSERT is committed on its
    own, and a failing row is reported and counted without stopping the run.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cu = conn.cursor()

        cu.execute("SELECT LOWER(COALESCE(title, '')) FROM pgz_sport.dokumenti")
        existing = {r[0] for r in cu.fetchall()}

        n_added = 0
        n_skip = 0
        n_err = 0
        for d in DOCS:
            title_key = d[0].lower()
            if title_key in existing:
                n_skip += 1
                continue
            try:
                cu.execute('''INSERT INTO pgz_sport.dokumenti
                    (title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik,
                     izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)''', d)
                n_added += 1
                # Also guards against duplicate titles inside DOCS itself.
                existing.add(title_key)
            except Exception as e:
                # Best-effort load: report the row, count it, keep going.
                n_err += 1
                print(f' err {d[0][:40]}: {e}')
        # Failures are tallied so Added + Skipped + Errors == len(DOCS).
        print(f'Added: {n_added}, Skipped: {n_skip}, Errors: {n_err}')

        cu.execute('SELECT count(*) FROM pgz_sport.dokumenti')
        print(f'TOTAL: {cu.fetchone()[0]}')
    finally:
        # Release the connection even when a query above raises.
        conn.close()


if __name__ == '__main__':
    main()
+125
View File
@@ -0,0 +1,125 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: klub_oib_enricher.py | v1.0.0 | 04.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scrapers/klub_oib_enricher.py
# Svrha: Enrichment OIB-a za pgz_sport.klubovi (678 BEZ OIB)
# Strategy: 1) match s civic.entities by naziv → kopiraj OIB
# 2) DDG search za kluba s "OIB" i extract
# 3) Sudreg API lookup po naziv (cache)
# ═══════════════════════════════════════════════════════════════════
"""OIB enrichment za PGŽ Sport klubove."""
import os, sys, time, re, hashlib, logging, json
import psycopg2
from psycopg2.extras import RealDictCursor
# Configure root logging at INFO and tag every line with "[klub_oib]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [klub_oib] %(message)s')
log = logging.getLogger("klub_oib")
# Connection string passed to psycopg2.connect() by db().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
def db():
    """Open a new connection to DSN whose cursors return rows as dicts (RealDictCursor)."""
    connection = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    return connection
def normalize_name(n):
    """Normalize a club name for matching.

    Lower-cases the name, removes generic organisation words and club-type
    abbreviations (klub, udruga, sportski, nk, hk, ...) and collapses
    whitespace. Returns "" for empty or None input.

    Whitespace is collapsed after the token removal, so a removed word in the
    middle of a name does not leave a double space behind.
    """
    if not n:
        return ""
    n = n.lower()
    # remove common prefixes/suffixes
    n = re.sub(r'\b(klub|udruga|sportski|nk|hk|kk|bk|ok|rk|jk|šk|sk|tk)\b', '', n)
    return re.sub(r'\s+', ' ', n).strip()
def _like_escape(s):
    """Escape LIKE/ILIKE wildcards in *s* (backslash is PostgreSQL's default escape character)."""
    return s.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


def match_civic_entities(cur, naziv):
    """Try to find the civic.entities row that corresponds to club name *naziv*.

    Args:
        cur: cursor whose rows support access by column name (row['naziv']).
        naziv: club name; names shorter than 5 characters are not matched.

    Returns:
        The best candidate row (it has at least 'oib' and 'naziv'), or None
        when no candidate shares at least 60% of its normalized words with
        the club name.
    """
    if not naziv or len(naziv) < 5:
        return None
    naziv_norm = normalize_name(naziv)
    if not naziv_norm:
        return None

    # First pass: substring match on the first 30 characters of the raw name.
    # Wildcards are escaped so '%' or '_' in a club name is matched literally.
    cur.execute("""
        SELECT oib, name AS naziv, county AS county_code FROM civic.entities
        WHERE oib IS NOT NULL AND length(oib) = 11
        AND lower(name) ILIKE %s
        LIMIT 5
        """, (f'%{_like_escape(naziv[:30].lower())}%',))
    candidates = cur.fetchall()

    if not candidates:
        # Fallback: probe with up to three of the longer normalized words and
        # keep the candidates of the first word that returns anything.
        for word in [w for w in naziv_norm.split() if len(w) > 3][:3]:
            cur.execute("""
                SELECT oib, name AS naziv FROM civic.entities
                WHERE oib IS NOT NULL AND length(oib) = 11
                AND lower(name) ILIKE %s
                LIMIT 3
                """, (f'%{_like_escape(word)}%',))
            candidates = cur.fetchall()
            if candidates:
                break
    if not candidates:
        return None

    # Score candidates by word overlap of the normalized names.
    wanted = set(naziv_norm.split())
    best = None
    best_score = 0
    for c in candidates:
        cand_words = set(normalize_name(c['naziv']).split())
        if not cand_words:
            continue
        overlap = len(wanted & cand_words) / max(len(wanted), len(cand_words))
        if overlap > best_score and overlap >= 0.6:
            best_score = overlap
            best = c
    return best
def enrich():
    """Fill in missing or malformed OIBs in pgz_sport.klubovi from civic.entities.

    Selects every club whose OIB is NULL, empty or not 11 characters long,
    looks for a matching entity with match_civic_entities() and stores the
    matched OIB.

    The UPDATE filter mirrors the SELECT filter, so a club holding a malformed
    OIB is actually overwritten rather than being logged as enriched while the
    row stays unchanged.

    Returns:
        Number of rows that were actually updated.
    """
    conn = db()
    conn.autocommit = True
    cur = conn.cursor()
    enriched = 0
    no_match = 0
    try:
        # Get klubovi without a valid OIB
        cur.execute("""
            SELECT id, naziv, savez_id, sport, grad
            FROM pgz_sport.klubovi
            WHERE (oib IS NULL OR oib = '' OR length(oib) != 11)
            ORDER BY id
            """)
        klubovi = cur.fetchall()
        log.info(f"Klubovi za enrichment: {len(klubovi)}")

        for processed, k in enumerate(klubovi, start=1):
            match = match_civic_entities(cur, k['naziv'])
            if match:
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET oib = %s, updated_at = now()
                    WHERE id = %s AND (oib IS NULL OR oib = '' OR length(oib) != 11)
                    """, (match['oib'], k['id']))
                # Count only rows that were really written.
                if cur.rowcount:
                    log.info(f"{k['naziv'][:40]} → OIB {match['oib']} (matched: {match['naziv'][:40]})")
                    enriched += 1
            else:
                no_match += 1
            if processed % 50 == 0:
                log.info(f"Progress: {enriched} enriched / {no_match} no_match / {processed}/{len(klubovi)}")

        log.info(f"FINAL: {enriched} enriched, {no_match} no_match")
    finally:
        cur.close()
        conn.close()
    return enriched


if __name__ == "__main__":
    enrich()
+203
View File
@@ -0,0 +1,203 @@
#!/usr/bin/env python3
"""LLM mining v6 - multi-godina, persist to clan_nagrada, name validation."""
import json, re, psycopg2, requests, time, os, sys
# Connection parameters for psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
user='rinet', password='R1net2026!SecureDB#v7')
# The connection is opened at module load time; autocommit is on, so every
# statement issued through `cu` is committed immediately.
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()
# URL and model name that call_llm() sends its generate requests to.
OLLAMA = "http://localhost:11434/api/generate"
MODEL = "qwen2.5:7b"
# Extraction prompt (Croatian) sent to the model. call_llm() replaces the
# ___TXT___ placeholder with the chunk text. The prompt asks for a bare JSON
# array of result objects with the keys shown in the example object below.
PROMPT_TPL = """Iz teksta godišnjaka sporta ekstraktiraj sve KONKRETNE rezultate sportaša. Vrati ISKLJUČIVO valjani JSON niz.
Format jednog objekta:
{"sportas":"Ime Prezime","klub":"AK Kvarner","sport":"atletika","godina":2024,"natjecanje":"PH","disciplina":"100m","plasman":1,"medalja":"zlato"}
KAKO ČITATI:
- "1. mjesto" → plasman=1, medalja="zlato"
- "2. mjesto" → plasman=2, medalja="srebro"
- "3. mjesto" → plasman=3, medalja="bronca"
- "4./5./6./7./8. mjesto" → plasman=N, medalja=null
- Ako tekst eksplicitno kaže "zlato/srebro/bronca", uzmi to
PRAVILA:
- Vrati [] ako nema konkretnih rezultata
- Sport mora biti pravi naziv: atletika, plivanje, košarka, rukomet, vaterpolo, nogomet, jedrenje, biciklizam, šah, taekwondo, karate, gimnastika, tenis, judo, streljaštvo, boćanje, kuglanje, veslanje, stolni tenis, plesovi itd.
- "klub" - ekstraktiraj iz teksta ako je naveden
- BEZ markdowna, BEZ ```, samo JSON niz
- Ne izmišljaj imena - ako nije sigurno, preskoči
TEKST:
___TXT___"""
# Keyword pre-filter: find_chunks() keeps a chunk only if this pattern matches.
# NOTE(review): `EP\b` / `SP\b` have no leading word boundary and the pattern is
# case-insensitive, so any word ending in "ep"/"sp" also matches — confirm this
# looseness is acceptable for a pre-filter.
KW = re.compile(r'(zlat|srebr|bronc|prvenstv|prvak|EP\b|SP\b|olimp|medalj|svjetsk|europsk|\d\.\s*mjest)', re.IGNORECASE)
# Two consecutive capitalised words ("Ime Prezime"), including Croatian letters.
NAMES_RE = re.compile(r'\b([A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+\s+[A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+)\b')
# Pre-filter junk - non-name patterns that LLM can pick up
# NOTE(review): alternatives are matched as case-insensitive prefixes with no
# trailing word boundary, so a real surname starting with e.g. "Web", "Tel" or
# "Klub" would also be blacklisted — confirm whether `\b` is wanted for the
# acronyms and short tokens.
NAME_BLACKLIST = re.compile(r'^(Sport Psychology|Endurance|HNS|HOO|HRS|HKS|MSO|HSS|SSP|TVS|HTS|RNK|ZSU|DVD|Liga HSS|HRP|Vikend|Posto|Visin|Razin|Klub|Sportski|Nogomet|Adresa|Email|Web|Tel|Mob|Fax)', re.IGNORECASE)
def find_chunks(sadrzaj, chunk_size=2200, overlap=200):
    """Split *sadrzaj* into overlapping windows and keep the promising ones.

    A window is kept when it matches the KW keyword pattern and contains at
    least two NAMES_RE matches.

    Args:
        sadrzaj: full document text; None or "" yields no chunks.
        chunk_size: window length in characters.
        overlap: number of characters shared by consecutive windows.

    Returns:
        List of (offset, chunk_text) tuples in document order.

    Raises:
        ValueError: if chunk_size is not greater than overlap. The window
            would never advance; the previous implementation looped forever.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("chunk_size must be greater than overlap")
    sadrzaj = sadrzaj or ""
    chunks = []
    for i in range(0, len(sadrzaj), step):
        chunk = sadrzaj[i:i + chunk_size]
        if KW.search(chunk) and len(NAMES_RE.findall(chunk)) >= 2:
            chunks.append((i, chunk))
    return chunks
def _parse_json_array(out):
    """Extract the first JSON array from raw model output; return [] if none parses.

    Strips a surrounding Markdown code fence, then parses from the first
    '[' that opens an array of objects (or an empty array). If the text does
    not parse as a whole, it is salvaged within its last 3000 characters:
    first by cutting after each ']' (trailing chatter after the array), then
    by cutting after each '}' and closing the array (output truncated
    mid-array). json.loads validates every attempt.
    """
    out = re.sub(r'^```(?:json)?\s*', '', out.strip())
    out = re.sub(r'\s*```$', '', out)
    m = re.search(r'\[\s*[\{\]]', out)
    if not m:
        return []
    t = out[m.start():]
    try:
        return json.loads(t)
    except json.JSONDecodeError:
        pass
    floor = max(len(t) - 3000, 0)
    for closer, suffix in ((']', ''), ('}', ']')):
        end = t.rfind(closer)
        # Visit every occurrence from the end. The previous fixed 50-character
        # stride almost never landed on a closing bracket.
        while end > floor:
            try:
                return json.loads(t[:end + 1] + suffix)
            except json.JSONDecodeError:
                end = t.rfind(closer, 0, end)
    return []


def call_llm(text):
    """Send *text* to the model and return the extracted result objects.

    Returns:
        A list parsed from the model's JSON output ([] when the output holds
        no parsable array), or None when the request failed, returned a
        non-200 status or produced an unusable response body.
    """
    try:
        rsp = requests.post(OLLAMA, json={
            "model": MODEL,
            "prompt": PROMPT_TPL.replace("___TXT___", text),
            "stream": False,
            "options": {"temperature": 0.0, "num_predict": 3000, "num_ctx": 4096}
        }, timeout=180)
        if rsp.status_code != 200:
            return None
        return _parse_json_array(rsp.json().get('response', ''))
    except Exception:
        # Best-effort boundary: any transport or parsing failure is reported
        # to the caller as None so one bad chunk does not stop the run.
        return None
def _coerce_plasman(value):
    """Return *value* as an int placement in 1..8, or None when it is not one.

    Accepts ints, integral floats and digit strings such as "1" or "1.".
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, str):
        value = value.strip().rstrip('.')
        if not (value.isascii() and value.isdigit()):
            return None
        value = int(value)
    elif isinstance(value, float):
        if not value.is_integer():
            return None
        value = int(value)
    elif not isinstance(value, int):
        return None
    return value if 1 <= value <= 8 else None


def insert_fact(f, godina_godisnjaka, doc_id):
    """Insert one LLM-extracted result into clan_nagrada with dedup.

    The fact comes from model output, so every field is type-checked before
    use: a null or non-string name, club or competition no longer raises.

    Args:
        f: dict with the keys requested by the prompt (sportas, klub, godina,
            natjecanje, disciplina, plasman, medalja).
        godina_godisnjaka: year of the source yearbook, used when the fact has
            no 'godina'.
        doc_id: id of the source document, stored as "dokument:<id>".

    Returns:
        'inserted' when the upsert ran, 'blacklisted' when the name matches
        NAME_BLACKLIST, 'err:<message>' when the insert failed, and None when
        the fact is rejected (name shorter than 4 characters, placement
        outside 1..8).
    """
    sportas = f.get('sportas')
    sportas = sportas.strip() if isinstance(sportas, str) else ''
    if len(sportas) < 4:
        return None
    if NAME_BLACKLIST.match(sportas):
        return 'blacklisted'
    plasman = _coerce_plasman(f.get('plasman'))
    if plasman is None:
        return None
    medalja = f.get('medalja')
    # Auto-fix medalja
    if not medalja:
        medalja = {1: 'zlato', 2: 'srebro', 3: 'bronca'}.get(plasman, medalja)
    god = f.get('godina') or godina_godisnjaka
    natj = str(f.get('natjecanje') or 'unknown')[:200]
    disc = str(f.get('disciplina') or 'unknown')[:200]

    # Try to match clan_id by ime+prezime
    clan_id = None
    if len(sportas.split(None, 1)) == 2:
        cu.execute("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime||' '||prezime) = LOWER(%s)
            OR LOWER(prezime||' '||ime) = LOWER(%s)
            LIMIT 1""", (sportas, sportas))
        rec = cu.fetchone()
        if rec:
            clan_id = rec[0]

    # Try to match klub_id
    klub = f.get('klub')
    klub_naziv = klub.strip() if isinstance(klub, str) else ''
    klub_id = None
    if len(klub_naziv) >= 4:
        # Escape LIKE wildcards so '%' or '_' in the club name matches literally.
        escaped = klub_naziv.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        cu.execute("""SELECT id FROM pgz_sport.klubovi
            WHERE LOWER(naziv) = LOWER(%s)
            OR LOWER(naziv) LIKE LOWER(%s)
            ORDER BY
            CASE WHEN aktivan THEN 0 ELSE 1 END,
            CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
            id ASC LIMIT 1""",
            (klub_naziv, f"%{escaped}%"))
        rec = cu.fetchone()
        if rec:
            klub_id = rec[0]

    try:
        cu.execute("""INSERT INTO pgz_sport.clan_nagrada
            (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
            disciplina, plasman, medalja, source, source_url, last_updated)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'llm_godisnjak', %s, now())
            ON CONFLICT ON CONSTRAINT clan_nagrada_uniq DO UPDATE SET
            clan_id = COALESCE(pgz_sport.clan_nagrada.clan_id, EXCLUDED.clan_id),
            klub_id = COALESCE(pgz_sport.clan_nagrada.klub_id, EXCLUDED.klub_id),
            last_updated = now()""",
            (clan_id, sportas, klub_id, klub_naziv or None, god, natj, disc, plasman, medalja,
             f"dokument:{doc_id}"))
        return 'inserted'
    except Exception as e:
        return f'err:{e}'
# Years to mine: comma-separated first CLI argument, default 2024.
GODINA_LIST = [int(x) for x in (sys.argv[1].split(',') if len(sys.argv) > 1 else ['2024'])]
# Maximum number of chunks sent to the model per yearbook (env LLM_LIMIT).
LIMIT = int(os.environ.get('LLM_LIMIT', 50))
total_inserted = 0
total_blacklisted = 0
total_facts = 0
total_errors = 0
t_global = time.time()

try:
    for godina in GODINA_LIST:
        # ORDER BY makes the pick deterministic when a year has several yearbooks.
        cu.execute("SELECT id, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina=%s ORDER BY id LIMIT 1", (godina,))
        rec = cu.fetchone()
        if not rec:
            print(f"=== Nema {godina} ===")
            continue
        did, sadrzaj = rec
        # A yearbook row without text would crash len() below.
        sadrzaj = sadrzaj or ""
        print(f"\n=== Godišnjak {godina} (id={did}, {len(sadrzaj):,} chars) ===")
        chunks = find_chunks(sadrzaj)
        n_proc = min(LIMIT, len(chunks))
        print(f"Chunks: {len(chunks)}, processing {n_proc}")

        god_inserted = 0
        god_facts = 0
        t_god = time.time()
        for idx, (off, chunk) in enumerate(chunks[:LIMIT]):
            t0 = time.time()
            facts = call_llm(chunk)
            el = time.time() - t0
            if facts is None:
                print(f"  [{idx+1}/{n_proc}] ERR ({el:.1f}s)", flush=True)
                continue
            if not facts:
                continue
            for f in facts:
                if not isinstance(f, dict):
                    continue
                res = insert_fact(f, godina, did)
                god_facts += 1
                if res == 'inserted':
                    god_inserted += 1
                elif res == 'blacklisted':
                    total_blacklisted += 1
                elif isinstance(res, str) and res.startswith('err:'):
                    # Surface the first few insert failures instead of dropping them.
                    total_errors += 1
                    if total_errors <= 3:
                        print(f"  insert {res}", flush=True)
            print(f"  [{idx+1}/{n_proc}] {el:.1f}s {len(facts)} parsed", flush=True)

        print(f"\n=== {godina}: {god_facts} facts, {god_inserted} inserted, {time.time()-t_god:.0f}s ===")
        total_facts += god_facts
        total_inserted += god_inserted

    print(f"\n=== TOTAL ({time.time()-t_global:.0f}s): {total_facts} parsed, {total_inserted} inserted, {total_blacklisted} blacklisted ===")
    if total_errors:
        print(f"=== insert errors: {total_errors} ===")

    # Stats
    cu.execute("""SELECT godina, count(*) FROM pgz_sport.clan_nagrada
        WHERE source='llm_godisnjak' GROUP BY godina ORDER BY godina""")
    print("\n=== LLM clan_nagrada by year ===")
    for r in cu.fetchall():
        print(f"  {r[0]}: {r[1]}")

    cu.execute("""SELECT count(*) FILTER (WHERE clan_id IS NOT NULL),
        count(*) FILTER (WHERE clan_id IS NULL),
        count(*) FILTER (WHERE klub_id IS NOT NULL),
        count(*)
        FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak'""")
    r = cu.fetchone()
    print(f"\nLinking stats: {r[0]} clan_id linked, {r[1]} unlinked, {r[2]} klub_id linked of {r[3]} total")

    # Audit
    cu.execute("""INSERT INTO pgz_sport.audit_feed
        (table_name, action, source, source_url, details)
        VALUES ('clan_nagrada', 'llm_mining_v6_db', 'qwen2.5:7b', NULL, %s::jsonb)""",
        (json.dumps({"godine": GODINA_LIST, "facts": total_facts, "inserted": total_inserted, "chunk_limit": LIMIT}),))
finally:
    # The connection is opened at module level; release it however the run ends.
    cu.close()
    conn.close()
+7
View File
@@ -0,0 +1,7 @@
#!/bin/bash
# Run the three OIB enrichment scrapers back to back, keeping only the last
# two lines of each one's combined output, then wait 10 minutes and repeat.
scrapers=(
    /opt/pgz-sport/scrapers/klub_oib_enricher.py
    /opt/pgz-sport/scrapers/clan_oib_enricher.py
    /opt/pgz-sport/scrapers/sudreg_klub_search.py
)
while :; do
    for scraper in "${scrapers[@]}"; do
        python3 "$scraper" 2>&1 | tail -2
    done
    sleep 600
done
+8
View File
@@ -0,0 +1,8 @@
#!/bin/bash
# Run the three deep web scrapers back to back, keeping only the last two
# lines of each one's combined output, then wait 30 minutes and repeat.
scrapers=(
    /opt/pgz-sport/scrapers/pgz_sport_deep.py
    /opt/pgz-sport/scrapers/rijeka_sport_scraper.py
    /opt/pgz-sport/scrapers/rss_rijeka_scraper.py
)
while :; do
    echo "[$(date)] PGŽ deep cycle"
    for scraper in "${scrapers[@]}"; do
        python3 "$scraper" 2>&1 | tail -2
    done
    sleep 1800
done
+139
View File
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# pgz_sport_deep.py — Deep scrape sport-pgz.hr + pgz.hr/sport
import os, sys, time, hashlib, logging, re
from urllib.parse import urljoin, urlparse
import urllib.request
import psycopg2
from html import unescape
# Configure root logging at INFO and tag every line with "[pgz_deep]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [pgz_deep] %(message)s')
log = logging.getLogger("pgz_deep")
# Connection string passed to psycopg2.connect() by harvest().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# User-Agent header sent by fetch().
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"
# Seed URLs; harvest() starts its crawl queue from this list.
ROOTS = [
"https://sport-pgz.hr",
"https://www.pgz.hr/teme/sport/",
"https://www.pgz.hr/sport/",
"https://www.pgz.hr/o-zupaniji/upravna-tijela/upravni-odjel-za-kulturu-sport-tehnicku-kulturu/",
]
def fetch(url, retries=3):
    """Download *url* and return (text, status).

    The body is decoded as UTF-8 with undecodable bytes replaced.

    Args:
        url: absolute URL to fetch.
        retries: maximum number of attempts.

    Returns:
        (text, HTTP status) on success. (None, code) when the server answers
        with a permanent 4xx error, which is not retried. (None, 0) when
        every attempt failed.
    """
    import urllib.error  # local import: the file only imports urllib.request

    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode('utf-8', errors='replace'), r.status
        except urllib.error.HTTPError as e:
            # A missing or forbidden page will not appear on retry; only
            # timeouts (408) and rate limits (429) are worth waiting for.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                log.warning(f"Fetch fail {e.code}: {url}")
                return None, e.code
            log.warning(f"Fetch fail {attempt}: {url} {e}")
        except Exception as e:
            log.warning(f"Fetch fail {attempt}: {url} {e}")
        # Back off 3s, 6s, ... between attempts, but not after the last one.
        if attempt < retries:
            time.sleep(3 * attempt)
    return None, 0
def extract_text(html):
    """Return the visible text of *html* as one whitespace-normalised line.

    <script> and <style> elements are dropped together with their content,
    every remaining tag becomes a space, HTML entities are decoded and runs
    of whitespace collapse to a single space. Empty or None input gives "".
    """
    if not html:
        return ""
    cleaned = html
    for block_pattern in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        cleaned = re.sub(block_pattern, '', cleaned, flags=re.S|re.I)
    without_tags = re.sub(r'<[^>]+>', ' ', cleaned)
    decoded = unescape(without_tags)
    collapsed = re.sub(r'\s+', ' ', decoded)
    return collapsed.strip()
def find_links(html, base, allowed_domains=('pgz.hr', 'sport-pgz.hr')):
    """Return the absolute links in *html* that point at an allowed domain.

    Args:
        html: page source; empty or None gives [].
        base: URL of the page, used to resolve relative hrefs.
        allowed_domains: a link is kept when its host equals one of these
            domains or is a subdomain of one. A plain substring test would
            also accept look-alike hosts such as "pgz.hr.example.com".

    Returns:
        De-duplicated links in document order with fragments removed, so
        "#section" variants of one page are crawled once. A stable order
        keeps the caller's "first N links" selection deterministic.
    """
    if not html:
        return []
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            u = urljoin(base, m.group(1)).split('#', 1)[0]
            host = (urlparse(u).hostname or "").lower()
        except ValueError:
            # Malformed href (e.g. unbalanced IPv6 bracket): skip the link.
            continue
        if any(host == d or host.endswith('.' + d) for d in allowed_domains):
            found[u] = None
    return list(found)
def find_pdf_links(html, base):
    """Return absolute URLs of the PDF links found in *html*.

    An href counts as a PDF link when it contains ".pdf" (case-insensitive)
    either at its end or directly before a query string or fragment, so
    "report.pdf?ver=2" is found as well.

    Returns:
        De-duplicated URLs in document order; [] for empty or None input.
    """
    if not html:
        return []
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+\.pdf(?:[?#][^"\']*)?)["\']', html, re.I):
        try:
            found[urljoin(base, m.group(1))] = None
        except ValueError:
            # Malformed href: skip it rather than abort the page.
            continue
    return list(found)
def harvest():
    """Breadth-first crawl from ROOTS, storing pages, PDF links and text facts.

    For each fetched page (at most 300 visited URLs):
      * the extracted text is inserted into pgz_sport.dokumenti,
      * every PDF link on the page is recorded in pgz_sport.dokumenti,
      * if the text mentions a sport-related keyword, up to five 800-character
        chunks are inserted into dabi.knowledge,
      * up to 25 of the page's links are queued.

    Insert failures are logged and skipped. The connection is always closed.
    """
    from collections import deque  # local import: not among the file's imports

    sport_keywords = (
        'sport', 'klub', 'savez', 'sportaš', 'sportaši', 'natjecanj', 'manifestacij',
        'javne potrebe', 'sufinancir', 'kup', 'prvenstvo', 'liga', 'utakm', 'igrač',
        'trener', 'olimpij', 'paraolimpij', 'turn', 'medalj', 'pobjed',
        # 'gradonaceln' (no diacritic) can never match the accented spelling;
        # the correct form is added next to it.
        'gradonaceln', 'gradonačeln',
        'župan', 'rijeka', 'pgž', 'primorsko', 'subvenc', 'natječaj', 'odluka',
        'proračun', 'rebal',
    )

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()
    visited = set()
    queue = deque(ROOTS)
    queued = set(ROOTS)  # every URL ever enqueued, for O(1) duplicate checks
    docs = 0
    facts = 0
    pdfs_logged = 0

    try:
        while queue and len(visited) < 300:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url)
            if not html or status != 200:
                time.sleep(1)
                continue
            log.info(f"[{status}] {url[:80]} ({len(html)} bytes)")
            # PostgreSQL text cannot hold NUL characters; strip them once so
            # neither the document insert nor the fact inserts fail on them.
            text = extract_text(html).replace('\x00', '')
            if len(text) < 100:
                continue

            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
            title = title_m.group(1).strip().replace('\x00', '') if title_m else url[:80]
            sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()

            try:
                cur.execute("""
                    INSERT INTO pgz_sport.dokumenti
                    (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                    ON CONFLICT DO NOTHING
                    """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'PGŽ'))
                docs += cur.rowcount
            except Exception as e:
                log.warning(f"Doc insert fail: {e}")

            # PDF links — log them
            for pdf_url in find_pdf_links(html, url):
                try:
                    pdf_sha = hashlib.sha1(pdf_url.encode()).hexdigest()
                    cur.execute("""
                        INSERT INTO pgz_sport.dokumenti
                        (url, pdf_url, fname, title, vrsta, izvor_url, scraped_at, sha1, organizacija)
                        VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                        ON CONFLICT DO NOTHING
                        """, (pdf_url, pdf_url, pdf_url.split('/')[-1][:100],
                              pdf_url.split('/')[-1][:200], 'pdf', url, pdf_sha, 'PGŽ'))
                    pdfs_logged += cur.rowcount
                except Exception as e:
                    log.warning(f"PDF insert fail: {e}")

            # Knowledge facts — sport relevant
            lowered = text.lower()
            if any(kw in lowered for kw in sport_keywords):
                # Save chunk as fact
                chunks = [text[i:i+800] for i in range(0, min(len(text), 5000), 800)]
                for ci, chunk in enumerate(chunks[:5]):
                    if len(chunk) < 200:
                        continue
                    fact_hash = hashlib.sha256((url + str(ci) + chunk[:100]).encode()).hexdigest()
                    try:
                        cur.execute("""
                            INSERT INTO dabi.knowledge
                            (fact, category, source, source_url, source_date, confidence, data_hash)
                            VALUES (%s, 'pgz_sport_official', 'pgz_sport_deep', %s, CURRENT_DATE, 0.85, %s)
                            ON CONFLICT (data_hash) DO NOTHING
                            """, (chunk[:1500], url, fact_hash))
                        facts += cur.rowcount
                    except Exception as e:
                        log.warning(f"Fact insert fail: {e}")

            # Follow links
            for l in find_links(html, url)[:25]:
                if l not in visited and l not in queued:
                    queue.append(l)
                    queued.add(l)

        log.info(f"FINAL: visited={len(visited)} docs={docs} pdfs={pdfs_logged} facts={facts}")
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    harvest()
+121
View File
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
# qa_from_sport_facts.py — Generate Q&A pairs from PGZ sport facts za DABI trening
import psycopg2, hashlib, logging, json, re
# Configure root logging at INFO and tag every line with "[qa_gen]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [qa_gen] %(message)s')
log = logging.getLogger("qa_gen")
# Connection string passed to psycopg2.connect() by main().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
def main():
    """Generate question/answer pairs from pgz_sport tables into dabi.training_qa.

    Pairs are built from active clubs, federations, people holding a post in
    three or more clubs, events and competitions, then inserted one by one.

    NOTE(review): only question, answer, category and the constant source_type
    'pgz_sport_auto' are persisted. De-duplication relies on ON CONFLICT DO
    NOTHING, which only has an effect if dabi.training_qa has a unique
    constraint (not visible here) — otherwise every run inserts duplicates.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()
    qa_rows = []  # (question, answer, category)

    def add(question, answer, category):
        qa_rows.append((question, answer, category))

    try:
        # Klubovi facts → Q&A
        cur.execute("""
            SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, godina_osnutka, broj_clanova
            FROM pgz_sport.klubovi WHERE aktivan = true AND naziv IS NOT NULL
            LIMIT 1000
            """)
        for _kid, naziv, oib, sport, grad, preds, tajn, god, n_cl in cur.fetchall():
            cat = 'pgz_sport_klub_qa'
            if oib: add(f"Koji je OIB kluba {naziv}?", f"OIB kluba {naziv} je {oib}.", cat)
            if sport: add(f"Kojim sportom se bavi klub {naziv}?", f"Klub {naziv} bavi se sportom: {sport}.", cat)
            if grad: add(f"U kojem gradu je klub {naziv}?", f"Klub {naziv} djeluje u gradu {grad}.", cat)
            if preds: add(f"Tko je predsjednik kluba {naziv}?", f"Predsjednik kluba {naziv} je {preds}.", cat)
            if tajn: add(f"Tko je tajnik kluba {naziv}?", f"Tajnik kluba {naziv} je {tajn}.", cat)
            if god: add(f"Kada je osnovan klub {naziv}?", f"Klub {naziv} osnovan je {god}. godine.", cat)
            if n_cl: add(f"Koliko članova ima klub {naziv}?", f"Klub {naziv} ima {n_cl} članova.", cat)

        # Savezi facts
        cur.execute("""
            SELECT id, naziv, oib, sport, predsjednik, tajnik FROM pgz_sport.savezi
            WHERE naziv IS NOT NULL
            """)
        for _sid, naziv, oib, _sport, preds, _tajn in cur.fetchall():
            cat = 'pgz_sport_savez_qa'
            if oib: add(f"Koji je OIB saveza {naziv}?", f"OIB saveza {naziv} je {oib}.", cat)
            if preds: add(f"Tko je predsjednik {naziv}?", f"Predsjednik {naziv} je {preds}.", cat)

        # Multi-chair questions. Group by the normalised name key (pk) so
        # spelling variants that differ only in case or surrounding spaces are
        # counted as one person; grouping by the raw name split them.
        # NOTE(review): equal names may still belong to different people.
        cur.execute("""
            WITH all_links AS (
            SELECT lower(trim(predsjednik)) AS pk, predsjednik AS pname,
            'klub:'||k.id AS oid, k.naziv AS oname, 'predsjednik' AS role
            FROM pgz_sport.klubovi k WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
            UNION ALL
            SELECT lower(trim(tajnik)), tajnik, 'klub:'||k.id, k.naziv, 'tajnik'
            FROM pgz_sport.klubovi k WHERE tajnik IS NOT NULL AND length(trim(tajnik)) > 5
            )
            SELECT min(pname) AS pname, count(DISTINCT oid) AS n,
            string_agg(DISTINCT oname, ', ') AS klubovi
            FROM all_links GROUP BY pk HAVING count(DISTINCT oid) >= 3
            ORDER BY 2 DESC LIMIT 50
            """)
        for pname, n, klubovi in cur.fetchall():
            # string_agg is NULL when every club name is NULL.
            klubovi = klubovi or ''
            q = f"U koliko klubova/saveza je {pname} u funkciji?"
            a = f"{pname} sjedi na {n} stolica u PGŽ Sport. Klubovi: {klubovi[:300]}. Ovo je multi-chair pozicija — moguć sukob interesa."
            add(q, a, 'pgz_sport_multichair_qa')

        # Manifestacije Q&A
        cur.execute("SELECT id, naziv, mjesto, godina_od, organizator FROM pgz_sport.manifestacije WHERE naziv IS NOT NULL")
        for _mid, naziv, mjesto, god, org in cur.fetchall():
            cat = 'pgz_sport_manifestacija_qa'
            if mjesto: add(f"Gdje se održava manifestacija {naziv}?", f"Manifestacija {naziv} održava se u {mjesto}.", cat)
            if org: add(f"Tko organizira manifestaciju {naziv}?", f"Manifestaciju {naziv} organizira {org}.", cat)
            if god: add(f"Otkad se održava manifestacija {naziv}?", f"Manifestacija {naziv} održava se od {god}. godine.", cat)

        # Natjecanja Q&A
        cur.execute("SELECT id, naziv, sport, sezona, razina, tip FROM pgz_sport.natjecanja WHERE naziv IS NOT NULL LIMIT 500")
        for _nid, naziv, sport, sez, raz, _tip in cur.fetchall():
            cat = 'pgz_sport_natjecanje_qa'
            if sport: add(f"Kojim sportom se bavi natjecanje {naziv}?", f"Natjecanje {naziv} je u sportu {sport}.", cat)
            if raz: add(f"Koja je razina natjecanja {naziv}?", f"Natjecanje {naziv} je razine {raz}.", cat)
            if sez: add(f"U kojoj sezoni je natjecanje {naziv}?", f"Natjecanje {naziv} je sezona {sez}.", cat)

        # Save to dabi.training_qa
        inserted = 0
        failed = 0
        for question, answer, category in qa_rows:
            try:
                cur.execute("""
                    INSERT INTO dabi.training_qa
                    (question, answer, category, source_type, created_at)
                    VALUES (%s, %s, %s, %s, now())
                    ON CONFLICT DO NOTHING
                    """, (question, answer, category, 'pgz_sport_auto'))
                inserted += cur.rowcount
            except Exception as e:
                failed += 1
                # Log only the first few failures to avoid flooding the log.
                if failed <= 3:
                    log.warning(f"insert fail: {e}")
        log.info(f"Q&A pairs generated: {len(qa_rows)}, inserted: {inserted}, failed: {failed}")
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    main()
+20
View File
@@ -0,0 +1,20 @@
#!/usr/bin/env python3
# rijeka_sport_scraper.py — sport.rijeka.hr + rijeka.hr/sport
import sys, os
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from pgz_sport_deep import harvest as base_harvest, fetch, extract_text, find_links, find_pdf_links
import logging
import re
from urllib.parse import urljoin, urlparse

# pgz_sport_deep configured logging on import; replace its handler so lines
# from this run are tagged "[rijeka_sport]".
logging.getLogger().handlers.clear()
logging.basicConfig(level=logging.INFO, format='%(asctime)s [rijeka_sport] %(message)s')

# Override roots
import pgz_sport_deep
pgz_sport_deep.ROOTS = [
    "https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/",
    "https://www.rijeka.hr/sport/",
    "https://sport.rijeka.hr",
    "https://www.rijekasport.hr",
]

# Domains of the roots above (and their subdomains).
RIJEKA_DOMAINS = ('rijeka.hr', 'rijekasport.hr')


def find_rijeka_links(html, base):
    """Return de-duplicated absolute links in *html* that stay on a Rijeka domain."""
    if not html:
        return []
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            u = urljoin(base, m.group(1))
            host = (urlparse(u).hostname or "").lower()
        except ValueError:
            continue  # malformed href
        if any(host == d or host.endswith('.' + d) for d in RIJEKA_DOMAINS):
            found[u] = None
    return list(found)


# Overriding ROOTS alone is not enough: harvest() follows links through the
# module-level find_links, which only accepts pgz.hr hosts, so no rijeka.hr
# link would ever be queued. Swap in the Rijeka filter as well.
# NOTE(review): harvest() still labels stored rows with organizacija 'PGŽ' and
# category 'pgz_sport_official' — confirm that is intended for Rijeka pages.
pgz_sport_deep.find_links = find_rijeka_links

if __name__ == "__main__":
    pgz_sport_deep.harvest()
+145
View File
@@ -0,0 +1,145 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# Fajl: rss_rijeka_scraper.py | v1.0.0 | 04.05.2026
# Autor: Damir Radulić <dradulic@outlook.com>
# Lokacija: /opt/pgz-sport/scrapers/rss_rijeka_scraper.py
# Svrha: RSS / Zajednica sportskih udruga grada Rijeke deep scraper
# Cilj: financijski izvještaji, klubovi, sportaši, dokumenti
# ═══════════════════════════════════════════════════════════════════
"""RSS Rijeka scraper — klubovi, financiranje, dokumenti."""
import os, sys, time, hashlib, logging, re, json
from urllib.parse import urljoin, urlparse
import urllib.request
import psycopg2
from psycopg2.extras import execute_batch
from html import unescape
# Configure root logging at INFO and tag every line with "[rss]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [rss] %(message)s')
log = logging.getLogger("rss")
# Connection string passed to psycopg2.connect() by harvest().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# User-Agent header sent by fetch().
UA = "Mozilla/5.0 (Ri.NET Civic Intelligence Bot 1.0; contact: dradulic@outlook.com)"
# Probe potential domains
# (harvest() starts its crawl queue from this list.)
RSS_DOMAINS = [
"https://rijeckisportskisavez.hr",
"https://www.zsus-rijeka.hr",
"https://sport.rijeka.hr",
"https://rss-rijeka.hr",
"https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/"
]
def fetch(url, retries=3):
    """Download *url* politely and return (text, status).

    The body is decoded as UTF-8 with undecodable bytes replaced. After every
    successful download the function waits 2 seconds before returning.

    Args:
        url: absolute URL to fetch.
        retries: maximum number of attempts.

    Returns:
        (text, HTTP status) on success. (None, code) when the server answers
        with a permanent 4xx error, which is not retried. (None, 0) when
        every attempt failed.
    """
    import urllib.error  # local import: the file only imports urllib.request

    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                content = r.read().decode('utf-8', errors='replace')
                status = r.status
        except urllib.error.HTTPError as e:
            log.warning(f"Fetch fail {attempt}: {url} {e}")
            # A missing or forbidden page will not appear on retry; only
            # timeouts (408) and rate limits (429) are worth waiting for.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                return None, e.code
        except Exception as e:
            log.warning(f"Fetch fail {attempt}: {url} {e}")
        else:
            # Politeness delay, taken after the response has been closed so
            # the connection is not held open while sleeping.
            time.sleep(2.0)
            return content, status
        # Back off 3s, 6s, ... between attempts, but not after the last one.
        if attempt < retries:
            time.sleep(3 * attempt)
    return None, 0
def find_links(html, base_url):
    """Extract the links in *html* that stay on one of the crawled sites.

    In-page anchors, mailto: and javascript: hrefs are ignored. A link is
    kept when its host contains one of the site tokens below.

    Returns:
        De-duplicated absolute URLs in document order; [] for empty or None
        input. A stable order keeps the caller's "first N links" selection
        deterministic.
    """
    if not html:
        return []
    # NOTE(review): substring tokens also accept look-alike hosts such as
    # "rijeka.hr.example.com" — confirm whether exact domain/suffix matching
    # is wanted.
    site_tokens = ('rijeckisportskisavez', 'zsus-rijeka', 'rijeka.hr', 'rss-rijeka')
    found = {}
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        url = m.group(1)
        if url.startswith(('#', 'mailto:', 'javascript:')):
            continue
        try:
            # urljoin parses the href too and raises ValueError on malformed
            # ones, so it belongs inside the guarded section.
            full = urljoin(base_url, url)
            host = urlparse(full).hostname or ""
        except ValueError:
            continue
        if any(d in host for d in site_tokens):
            found[full] = None
    return list(found)
def extract_text(html):
    """Reduce *html* to plain text.

    Removes <script> and <style> elements with their content, replaces the
    remaining tags by spaces, decodes HTML entities and squeezes whitespace.
    Returns "" for empty or None input.
    """
    if not html:
        return ""
    flags = re.S | re.I
    no_script = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=flags)
    no_style = re.sub(r'<style[^>]*>.*?</style>', '', no_script, flags=flags)
    plain = unescape(re.sub(r'<[^>]+>', ' ', no_style))
    return re.sub(r'\s+', ' ', plain).strip()
def _oib_checksum_ok(oib):
    """Return True when the 11-digit string *oib* passes the ISO 7064 MOD 11,10 check."""
    a = 10
    for digit in oib[:10]:
        a = (a + int(digit)) % 10
        if a == 0:
            a = 10
        a = (a * 2) % 11
    control = 11 - a
    if control == 10:
        control = 0
    return control == int(oib[10])


def extract_oibs(text):
    """Find OIB numbers in text.

    Returns every standalone 11-digit number whose check digit is valid, in
    order of appearance. Other 11-digit numbers (phone numbers, reference
    numbers) are dropped rather than reported as OIBs.
    """
    return [m for m in re.findall(r'\b(\d{11})\b', text) if _oib_checksum_ok(m)]
def extract_money(text):
    """Return the numeric part of amounts followed by EUR, €, kn or HRK.

    Only numbers written with at least one thousands group are matched,
    optionally followed by a two-digit decimal part.
    """
    amount_pattern = re.compile(r'(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{2})?)\s*(?:EUR|€|kn|HRK)')
    amounts = amount_pattern.findall(text)
    return amounts
def harvest():
    """Breadth-first crawl from RSS_DOMAINS, storing pages and OIB sightings.

    For each fetched page (at most 200 visited URLs):
      * the extracted text is inserted into pgz_sport.dokumenti,
      * every OIB found in the text (except ones starting with '0000') is
        recorded as a fact in dabi.knowledge,
      * up to 30 of the page's links are queued.

    Insert failures are logged and skipped. The connection is always closed.
    """
    from collections import deque  # local import: not among the file's imports

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()
    visited = set()
    queue = deque(RSS_DOMAINS)
    queued = set(RSS_DOMAINS)  # every URL ever enqueued, for O(1) duplicate checks
    docs_inserted = 0
    facts_inserted = 0

    try:
        while queue and len(visited) < 200:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)
            html, status = fetch(url)
            if not html or status != 200:
                continue
            log.info(f"[{status}] {url} ({len(html)} bytes)")
            # PostgreSQL text cannot hold NUL characters; strip them so the
            # inserts below do not fail on them.
            text = extract_text(html).replace('\x00', '')
            if len(text) < 100:
                continue

            # Insert dokument
            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
            title = title_m.group(1).strip().replace('\x00', '') if title_m else url[:80]
            sha1 = hashlib.sha1(text.encode()).hexdigest()
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.dokumenti
                    (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                    ON CONFLICT DO NOTHING
                    """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'RSS Rijeka'))
                docs_inserted += cur.rowcount
            except Exception as e:
                log.warning(f"Insert fail: {e}")

            # Extract OIBs and create facts
            for oib in set(extract_oibs(text)):
                if oib.startswith('0000'):
                    continue
                fact = f"OIB {oib} pojavljuje se na RSS Rijeka stranici: {title[:100]}"
                fact_hash = hashlib.sha256((url + fact).encode()).hexdigest()
                try:
                    cur.execute("""
                        INSERT INTO dabi.knowledge
                        (fact, category, source, source_url, source_date, confidence, data_hash)
                        VALUES (%s, 'rss_rijeka', 'rss_rijeka_scraper', %s, CURRENT_DATE, 0.75, %s)
                        ON CONFLICT (data_hash) DO NOTHING
                        """, (fact[:500], url, fact_hash))
                    facts_inserted += cur.rowcount
                except Exception as e:
                    log.warning(f"Fact insert fail: {e}")

            # Find more links to follow
            for link in find_links(html, url)[:30]:
                if link not in visited and link not in queued:
                    queue.append(link)
                    queued.add(link)

        log.info(f"FINAL: visited={len(visited)} docs={docs_inserted} facts={facts_inserted}")
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    harvest()
+80
View File
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
# Federation deep scrape — HNS, HPS, HRS
import sys
sys.path.insert(0, '/opt/pgz-sport/scrapers')
from gov_hr_sport_scraper import fetch, extract_text, find_links
from urllib.parse import urljoin, urlparse
import time, re, hashlib, json, psycopg2
from html import unescape
import logging
# Configure root logging at INFO and tag every line with "[fed]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
log = logging.getLogger("fed")
# Connection string passed to psycopg2.connect() by harvest().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
# Federation code -> seed URLs. harvest() crawls each federation separately
# and stores the code as the document's `organizacija`.
ROOTS = {
"HNS": ["https://hns-cff.hr", "https://www.nspgz.hr"],
"HPS": ["https://www.hps.hr"],
"HRS": ["https://www.hrs.hr"],
"HOK": ["https://www.hok.hr"], # Croatian Olympic Committee
"HKS": ["https://www.hks.hr"], # per the original note: Croatian karate federation — NOTE(review): confirm hks.hr
}
def harvest():
    """Crawl each federation site in ROOTS and store pages and text facts.

    Per federation, at most 80 URLs are visited, with a 2 second pause before
    every request. For each page with at least 200 characters of text:
      * the text is inserted into pgz_sport.dokumenti (organizacija = the
        federation code),
      * if the text mentions a sport keyword, up to three 800-character
        chunks are inserted into dabi.knowledge,
      * same-host links are queued until the queue exceeds 100 entries.

    Insert failures are logged and skipped. The connection is always closed.
    """
    from collections import deque  # local import: not among the file's imports

    keywords = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()
    total_docs = total_facts = 0
    try:
        for fed, roots in ROOTS.items():
            log.info(f"=== {fed} deep ===")
            visited = set()
            queue = deque(roots)
            queued = set(roots)  # every URL ever enqueued, for O(1) duplicate checks
            while queue and len(visited) < 80:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)
                time.sleep(2)
                html, status = fetch(url)
                if not html or status != 200:
                    continue
                log.info(f"  [{status}] {url[:80]}")
                # PostgreSQL text cannot hold NUL characters; strip them so
                # the inserts below do not fail on them.
                text = extract_text(html).replace('\x00', '')
                if len(text) < 200:
                    continue
                title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
                title = title_m.group(1).strip().replace('\x00', '') if title_m else url[:80]
                sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
                try:
                    cur.execute("""INSERT INTO pgz_sport.dokumenti
                        (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                        VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
                        (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, fed))
                    total_docs += cur.rowcount
                except Exception as e:
                    log.warning(f"Doc insert fail: {e}")

                # Facts, if the page is sport-relevant
                lowered = text.lower()
                if any(kw in lowered for kw in keywords):
                    chunks = [text[i:i+800] for i in range(0, min(len(text), 3000), 800)]
                    for ci, chunk in enumerate(chunks[:3]):
                        if len(chunk) < 200:
                            continue
                        fh = hashlib.sha256((url+str(ci)+chunk[:80]).encode()).hexdigest()[:32]
                        try:
                            cur.execute("""INSERT INTO dabi.knowledge
                                (fact, category, source, source_refs, confidence, data_hash, created_at)
                                VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                                ON CONFLICT (data_hash) DO NOTHING""",
                                (chunk[:1500], f'fed_{fed.lower()}', json.dumps([{"url":url}]), fh))
                            total_facts += cur.rowcount
                        except Exception as e:
                            log.warning(f"Fact insert fail: {e}")

                # Follow internal links
                host = urlparse(url).hostname
                for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
                    try:
                        u = urljoin(url, m.group(1))
                        same_host = urlparse(u).hostname == host
                    except ValueError:
                        continue  # malformed href
                    if same_host and u not in visited and u not in queued:
                        queue.append(u)
                        queued.add(u)
                    if len(queue) > 100:
                        break

        log.info(f"TOTAL: docs={total_docs} facts={total_facts}")
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    harvest()
+11
View File
@@ -0,0 +1,11 @@
#!/bin/bash
# Master loop — PGŽ sport intensive learning every 15min

# run_step SCRIPT N: run SCRIPT with python3 and keep the last N lines of its
# combined stdout/stderr.
run_step() {
    python3 "$1" 2>&1 | tail "-$2"
}

while :; do
    echo "[$(date)] === LOOP START ==="
    run_step /opt/pgz-sport/scrapers/klub_oib_enricher.py 3
    run_step /opt/pgz-sport/scrapers/clan_oib_enricher.py 3
    run_step /opt/pgz-sport/scrapers/sport_to_knowledge.py 5
    run_step /opt/pgz-sport/scrapers/qa_from_sport_facts.py 3
    echo "[$(date)] === LOOP END, sleep 900s ==="
    sleep 900
done
+163
View File
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
# sport_to_knowledge.py v2.0 — match dabi.knowledge schema (source_refs jsonb)
import os, sys, hashlib, logging, json
import psycopg2
from psycopg2.extras import execute_batch
# Configure root logging at INFO and tag every line with "[sport2k]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sport2k] %(message)s')
log = logging.getLogger("sport2k")
# Connection string passed to psycopg2.connect() by main().
# NOTE(review): the database password is hard-coded in source control — consider
# loading it from the environment or a secrets store, and rotating it.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
def insert_batch(cur, facts):
    """Insert *facts* into dabi.knowledge in pages of 200 rows.

    Each fact is a 6-tuple (fact, category, source, source_refs JSON text,
    confidence, data_hash). Rows whose data_hash already exists are ignored
    by the database.

    Returns:
        The number of rows submitted (len(facts)), which may be higher than
        the number actually stored; 0 when *facts* is empty or None.
    """
    if facts:
        execute_batch(cur, """
            INSERT INTO dabi.knowledge
            (fact, category, source, source_refs, confidence, data_hash, created_at)
            VALUES (%s, %s, %s, %s::jsonb, %s, %s, now())
            ON CONFLICT (data_hash) DO NOTHING
            """, facts, page_size=200)
        return len(facts)
    return 0
def main():
    """Turn ``pgz_sport`` rows into text facts stored in ``dabi.knowledge``.

    Five groups of facts are generated, each from its own query: clubs,
    federations, per-federation club overviews, people holding several
    posts ("multi-chair"), and events plus competitions. Every fact gets
    a deterministic 32-hex-char ``data_hash`` so re-runs are deduplicated
    by the ``ON CONFLICT`` clause used in ``insert_batch``.

    The connection and cursor are always closed, even when a query fails.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    total = 0

    def short_hash(key):
        # First 32 hex chars of SHA-256: the value stored in data_hash.
        return hashlib.sha256(key.encode()).hexdigest()[:32]

    try:
        # 1) Clubs (active only)
        cur.execute("""
            SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, trener_glavni,
                   broj_clanova, broj_aktivnih_sportasa, godina_osnutka, web, telefon, email, adresa
            FROM pgz_sport.klubovi WHERE aktivan = true
        """)
        rows = []
        for k in cur.fetchall():
            kid, naziv, oib, sport, grad, preds, tajn, tren, n_cl, n_akt, god, web, tel, email, adr = k
            if not naziv:
                continue
            # (value, rendered text) pairs; falsy values are left out of the fact.
            labelled = [
                (sport, f"sport: {sport}"),
                (grad, f"grad: {grad}"),
                (oib, f"OIB: {oib}"),
                (god, f"osnovan {god}."),
                (preds, f"predsjednik: {preds}"),
                (tajn, f"tajnik: {tajn}"),
                (tren, f"glavni trener: {tren}"),
                (n_cl, f"broj članova: {n_cl}"),
                (n_akt, f"broj aktivnih sportaša: {n_akt}"),
                (adr, f"adresa: {adr}"),
                (tel, f"tel: {tel}"),
                (email, f"email: {email}"),
                (web, f"web: {web}"),
            ]
            parts = [f"Klub {naziv}"] + [text for value, text in labelled if value]
            fact = ". ".join(parts) + "."
            if len(fact) < 30:
                continue
            # The hash includes the fact prefix, so a changed club record
            # produces a new row instead of colliding with the old one.
            fact_hash = short_hash(f"klub:{kid}:{fact[:200]}")
            refs = json.dumps([{"type":"pgz_sport_klub","id": kid, "url": f"https://sport.rinet.one/admin#klub/{kid}"}])
            rows.append((fact[:2000], 'pgz_sport_klub', 'pgz_sport_db_extract', refs, 0.92, fact_hash))
        n = insert_batch(cur, rows); total += n; log.info(f"Klubovi facts: {n}")

        # 2) Federations
        # The query selects NULL for broj_klubova / broj_clanova, so the two
        # count clauses below never fire with the current SQL.
        cur.execute("""
            SELECT id, naziv, oib, sport, predsjednik, tajnik, web, NULL AS broj_klubova, NULL AS broj_clanova
            FROM pgz_sport.savezi
        """)
        rows = []
        for s in cur.fetchall():
            sid, naziv, oib, sport, preds, tajn, web, n_kl, n_cl = s
            if not naziv:
                continue
            labelled = [
                (sport, f"sport: {sport}"),
                (oib, f"OIB: {oib}"),
                (preds, f"predsjednik: {preds}"),
                (tajn, f"tajnik: {tajn}"),
                (n_kl, f"broj klubova: {n_kl}"),
                (n_cl, f"broj članova: {n_cl}"),
                (web, f"web: {web}"),
            ]
            parts = [f"Savez {naziv}"] + [text for value, text in labelled if value]
            fact = ". ".join(parts) + "."
            if len(fact) < 30:
                continue
            fact_hash = short_hash(f"savez:{sid}:{fact[:200]}")
            refs = json.dumps([{"type":"pgz_sport_savez","id": sid}])
            rows.append((fact[:2000], 'pgz_sport_savez', 'pgz_sport_db_extract', refs, 0.92, fact_hash))
        n = insert_batch(cur, rows); total += n; log.info(f"Savezi facts: {n}")

        # 3) Overview: active-club count and cities per federation
        cur.execute("""
            SELECT s.naziv, s.sport, count(k.id) AS n_kl,
                   string_agg(k.grad, ', ' ORDER BY k.grad) FILTER (WHERE k.grad IS NOT NULL) AS gradovi
            FROM pgz_sport.savezi s
            LEFT JOIN pgz_sport.klubovi k ON k.savez_id = s.id AND k.aktivan = true
            GROUP BY s.id, s.naziv, s.sport HAVING count(k.id) > 0
        """)
        rows = []
        for r in cur.fetchall():
            savez, sport, n, gradovi = r
            fact = f"{savez} ima {n} aktivnih klubova"
            if sport:
                fact += f" u sportu {sport}"
            if gradovi:
                # Cap the city list at 200 characters and mark the cut.
                # (Previously the list was cut at 300 + "..." and then cut
                # again at 200, which always dropped the ellipsis.)
                gradovi_str = gradovi[:200] + '...' if len(gradovi) > 200 else gradovi
                fact += f". Gradovi: {gradovi_str}"
            fact += "."
            fact_hash = short_hash(f"overview:{savez}")
            rows.append((fact[:2000], 'pgz_sport_overview', 'pgz_sport_db_extract', json.dumps([{}]), 0.95, fact_hash))
        n = insert_batch(cur, rows); total += n; log.info(f"Overview facts: {n}")

        # 4) Multi-chair: the same (lower-cased, trimmed) name appearing as
        #    president/secretary in at least two distinct organisations
        cur.execute("""
            WITH all_links AS (
                SELECT lower(trim(predsjednik)) AS pk, predsjednik AS pname,
                       'klub:'||k.id AS oid, k.naziv AS oname, 'predsjednik' AS role
                FROM pgz_sport.klubovi k WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
                UNION ALL
                SELECT lower(trim(tajnik)), tajnik, 'klub:'||k.id, k.naziv, 'tajnik'
                FROM pgz_sport.klubovi k WHERE tajnik IS NOT NULL AND length(trim(tajnik)) > 5
                UNION ALL
                SELECT lower(trim(predsjednik)), predsjednik, 'savez:'||s.id, s.naziv, 'predsjednik'
                FROM pgz_sport.savezi s WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
            )
            SELECT pk, max(pname) AS pname, count(DISTINCT oid) AS n_orgs,
                   string_agg(DISTINCT oname || ' (' || role || ')', '; ') AS orgs
            FROM all_links GROUP BY pk HAVING count(DISTINCT oid) >= 2
            ORDER BY count(DISTINCT oid) DESC LIMIT 200
        """)
        rows = []
        for r in cur.fetchall():
            pk, pname, n_orgs, orgs = r
            # string_agg yields NULL when every organisation name is NULL.
            fact = f"{pname} sjedi na {n_orgs} stolica u PGŽ Sport ekosustavu: {(orgs or '')[:500]}"
            if n_orgs >= 3:
                fact += " — VIŠESTRUKE FUNKCIJE: forenzički flag za moguće sukobe interesa."
            fact_hash = short_hash(f"multichair:{pk}")
            rows.append((fact[:2000], 'pgz_sport_multichair', 'pgz_sport_db_extract', json.dumps([{}]), 0.90, fact_hash))
        n = insert_batch(cur, rows); total += n; log.info(f"Multi-chair facts: {n}")

        # 5) Events + competitions (inserted together in one batch)
        cur.execute("SELECT id, naziv, mjesto, godina_od, organizator, razina, broj_ucesnika FROM pgz_sport.manifestacije WHERE naziv IS NOT NULL")
        rows = []
        for r in cur.fetchall():
            mid, naziv, mjesto, god, org, razina, n_uces = r
            fact = f"Sportska manifestacija: {naziv}"
            if mjesto: fact += f", mjesto: {mjesto}"
            if god: fact += f", godina {god}"
            if org: fact += f", organizator: {org}"
            if razina: fact += f", razina: {razina}"
            if n_uces: fact += f", broj učesnika: {n_uces}"
            fact += "."
            rows.append((fact[:2000], 'pgz_sport_manifestacija', 'pgz_sport_db_extract', json.dumps([{}]), 0.85, short_hash(f"man:{mid}")))
        cur.execute("SELECT id, naziv, sport, datum_pocetka::text AS godina, sezona, razina, tip, kategorija FROM pgz_sport.natjecanja WHERE naziv IS NOT NULL LIMIT 500")
        for r in cur.fetchall():
            nid, naziv, sport, god, sez, raz, tip, kat = r
            fact = f"Natjecanje: {naziv}"
            if sport: fact += f" — sport: {sport}"
            if sez: fact += f", sezona {sez}"
            if raz: fact += f", razina: {raz}"
            if tip: fact += f", tip: {tip}"
            if kat: fact += f", kategorija: {kat}"
            # god is the start date cast to text; keep only YYYY-MM-DD.
            if god and god != 'None': fact += f", datum početka: {god[:10]}"
            fact += "."
            rows.append((fact[:2000], 'pgz_sport_natjecanje', 'pgz_sport_db_extract', json.dumps([{}]), 0.85, short_hash(f"nat:{nid}")))
        n = insert_batch(cur, rows); total += n; log.info(f"Manifest+nat: {n}")

        log.info(f"═══ TOTAL: {total} ═══")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    main()
+50
View File
@@ -0,0 +1,50 @@
#!/usr/bin/env python3
# Sudreg lookup po klubu nazivu za one BEZ OIB
import psycopg2, time, hashlib, json, re
from psycopg2.extras import RealDictCursor
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
def main():
    """Fill in missing club OIBs by name lookup in the cached Sudreg data.

    Clubs without an 11-character OIB are matched against
    ``civic.sudreg_api_cache`` with a case-insensitive substring search on
    the club name (generic words such as "klub" removed, first 30 chars).
    An OIB is written only when exactly one candidate is found.

    ``found`` counts rows actually updated (``cursor.rowcount``), and the
    connection is closed even when a query fails.
    """
    conn = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    conn.autocommit = True
    cur = conn.cursor()
    try:
        # Clubs without a usable OIB whose name is long enough to search on
        cur.execute("""
            SELECT id, naziv, grad FROM pgz_sport.klubovi
            WHERE (oib IS NULL OR length(oib) != 11) AND naziv IS NOT NULL AND length(naziv) > 8
            ORDER BY id
        """)
        klubovi = cur.fetchall()
        print(f"Klubovi za Sudreg lookup: {len(klubovi)}")

        found = 0
        for k in klubovi:
            # Drop generic words so the search focuses on the distinctive part
            clean = re.sub(r'\b(klub|udruga|sportski|sportsko)\b', '', k['naziv'], flags=re.I).strip()
            if len(clean) < 5:
                continue
            pattern = f'%{clean[:30]}%'
            cur.execute("""
                SELECT data->>'oib' AS oib, data->>'tvrtka' AS tvrtka, data->>'naziv' AS naziv
                FROM civic.sudreg_api_cache
                WHERE (data->>'tvrtka' ILIKE %s OR data->>'naziv' ILIKE %s)
                  AND data->>'oib' IS NOT NULL
                LIMIT 3
            """, (pattern, pattern))
            cands = cur.fetchall()
            # Only an unambiguous single hit is trusted.
            if len(cands) != 1 or not cands[0]['oib']:
                continue
            # NOTE(review): rows with a malformed (non-NULL) OIB are selected
            # above but never overwritten here because of "oib IS NULL";
            # confirm whether they should be replaced.
            cur.execute("UPDATE pgz_sport.klubovi SET oib=%s WHERE id=%s AND oib IS NULL",
                        (cands[0]['oib'], k['id']))
            if cur.rowcount:
                found += 1
                if found % 20 == 0:
                    print(f"Found: {found}")
        print(f"FINAL: {found} OIB-ova nadeno preko sudreg_api_cache")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    main()
+74
View File
@@ -0,0 +1,74 @@
#!/usr/bin/env python3
# sukob_sport_scraper.py — Povjerenstvo za sukob interesa, filter za sport funkcionere
import os, time, hashlib, logging, re, json
from urllib.parse import urljoin
import urllib.request
import psycopg2
from html import unescape
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
log = logging.getLogger("sukob")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"
def fetch(url):
    """Download *url* and return ``(text, http_status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced.
    Any failure (bad URL, network error, HTTP error, timeout) yields
    ``(None, 0)``. ``KeyboardInterrupt`` and ``SystemExit`` are not
    swallowed, so the scraper can still be interrupted.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception:
        return None, 0
def harvest():
    """Cross-check sport officials against the conflict-of-interest register.

    Collects up to 100 distinct president/secretary names from clubs and
    federations, searches sukobinteresa.hr for the first 20 of them, and
    stores a flagged fact (with surrounding page text) in
    ``dabi.knowledge`` for every name found in the returned page.

    Database errors on insert are logged instead of silently ignored, and
    the connection is closed even when something fails.
    """
    from urllib.parse import quote_plus  # hoisted out of the per-name loop

    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    try:
        # Names (not OIBs) of club/federation presidents and secretaries
        cur.execute("""
            SELECT DISTINCT ime FROM (
                SELECT predsjednik AS ime FROM pgz_sport.klubovi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                UNION
                SELECT tajnik FROM pgz_sport.klubovi WHERE tajnik IS NOT NULL AND length(tajnik)>5
                UNION
                SELECT predsjednik FROM pgz_sport.savezi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
            ) t LIMIT 100
        """)
        sport_imena = [r[0].strip() for r in cur.fetchall() if r[0]]
        log.info(f"Sport imena za cross-check: {len(sport_imena)}")

        facts = 0
        for ime in sport_imena[:20]:
            url = f"https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search={quote_plus(ime)}"
            html, status = fetch(url)
            time.sleep(2)  # be polite to the remote site
            if not html or status != 200:
                continue

            needle = ime.lower()
            # NOTE(review): if the results page echoes the search term, this
            # substring test matches even when there are no real hits —
            # confirm against the site before trusting the flag.
            if needle not in html.lower():
                continue

            # Strip scripts and tags, collapse whitespace
            text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
            text = re.sub(r'<[^>]+>', ' ', text)
            text = re.sub(r'\s+', ' ', unescape(text)).strip()

            # The name may have appeared only inside removed markup.
            idx = text.lower().find(needle)
            if idx < 0:
                continue
            ctx = text[max(0, idx-300):idx+500]
            fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
            fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
            try:
                cur.execute("""INSERT INTO dabi.knowledge
                    (fact, category, source, source_refs, confidence, data_hash, created_at)
                    VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
                    ON CONFLICT (data_hash) DO NOTHING""",
                    (fact[:2000], json.dumps([{"url":url, "ime":ime}]), fh))
            except psycopg2.Error as exc:
                log.warning(f"insert failed for {ime}: {exc}")
                continue
            facts += cur.rowcount
            log.info(f"✓ Match: {ime}")
        log.info(f"FINAL: {facts} sukob facts")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    harvest()
+176
View File
@@ -0,0 +1,176 @@
#!/usr/bin/env python3
# ═══════════════════════════════════════════════════════════════════
# wiki_hr_scraper.py | v1.0.0 | 04.05.2026
# Svrha: Hrvatska Wikipedia — extract relevant pages za HR knowledge
# Strategy: API search po HR-relevant kategorijama + fetch top results
# ═══════════════════════════════════════════════════════════════════
"""Hrvatska Wikipedia scraper (preko API)."""
import os, time, hashlib, logging, re, json
import urllib.request, urllib.parse
import psycopg2
import sys
logging.basicConfig(level=logging.INFO, format='%(asctime)s [wiki_hr] %(message)s')
log = logging.getLogger("wiki_hr")
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
UA = "Ri.NET Bot 1.0 (contact: dradulic@outlook.com)"
API = "https://hr.wikipedia.org/w/api.php"
# Categories — a broad base of Croatian (HR) knowledge
CATEGORIES = [
"Hrvatski_gradovi",
"Hrvatske_općine",
"Hrvatski_otoci",
"Hrvatske_planine",
"Hrvatske_rijeke",
"Primorsko-goranska_županija",
"Naselja_u_Primorsko-goranskoj_županiji",
"Hrvatski_političari",
"Hrvatski_sportaši",
"Hrvatski_glazbenici",
"Hrvatski_pisci",
"Hrvatski_glumci",
"Hrvatska_povijest",
"Hrvatska_arhitektura",
"Hrvatska_kuhinja",
"Hrvatska_kultura",
"Hrvatska_znanost",
"Domovinski_rat",
"Hrvatska_ekonomija",
"Hrvatski_klubovi",
"Hrvatski_nogometni_klubovi",
"Hrvatski_košarkaški_klubovi",
"Hrvatske_političke_stranke",
"Predsjednici_Hrvatske",
"Premijeri_Hrvatske",
"Rijeka",
"Kvarner",
"Krk",
"Cres",
"Lošinj",
"Rab",
"Pag",
"Učka",
"Risnjak",
]
def api_get(params):
    """Call the Wikipedia API with *params* and return the decoded JSON.

    ``format=json`` and ``utf8=1`` are always added to the query.
    Any request or decoding failure is logged as a warning and an empty
    dict is returned instead.
    """
    query = dict(params)
    query.update(format='json', utf8='1')
    url = f"{API}?{urllib.parse.urlencode(query)}"
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read().decode('utf-8')
            return json.loads(payload)
    except Exception as e:
        log.warning(f"API fail: {e}")
        return {}
def category_members(cat, limit=500):
    """Return page titles that belong to the wiki category *cat*.

    Result pages are requested 500 titles at a time, following the
    ``cmcontinue`` token with a 0.5 s pause between requests. Collection
    stops as soon as *limit* titles are gathered, the API returns no
    ``query`` section, or there is no continuation token.
    """
    base_query = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtitle': f'Kategorija:{cat}',
        'cmlimit': '500',
        'cmtype': 'page'
    }
    titles = []
    token = ''
    while True:
        query = dict(base_query)
        if token:
            query['cmcontinue'] = token
        reply = api_get(query)
        if not reply.get('query'):
            return titles
        for member in reply['query'].get('categorymembers', []):
            titles.append(member['title'])
            if len(titles) >= limit:
                return titles
        token = reply.get('continue', {}).get('cmcontinue')
        if not token:
            return titles
        time.sleep(0.5)
def fetch_page_extract(title):
    """Return ``(plain_text_extract, full_url)`` for the page *title*.

    Returns ``(None, None)`` when the API call yields no ``query`` section
    or the page does not exist (page id ``-1``).

    The whole article text is requested. MediaWiki treats a boolean
    parameter as true whenever it is present, whatever its value, so the
    former ``'exintro': '0'`` actually limited the extract to the intro
    section; the parameter is therefore omitted.
    """
    params = {
        'action': 'query',
        'prop': 'extracts|info',
        'explaintext': '1',
        'inprop': 'url',
        'titles': title,
        'exsectionformat': 'plain',
        'exlimit': '1',
    }
    d = api_get(params)
    if not d.get('query'):
        return None, None
    pages = d['query'].get('pages', {})
    for pid, p in pages.items():
        if pid == '-1':
            continue  # "-1" is the placeholder id of a missing page
        return p.get('extract', ''), p.get('fullurl', '')
    return None, None
def harvest():
    """Harvest Croatian Wikipedia pages by category into ``dabi.knowledge``.

    For every category in ``CATEGORIES`` up to 200 member pages are
    processed. A page is skipped when a ``wiki_hr`` fact starting with
    its title (first 50 chars) already exists, or when its extract is
    shorter than 200 characters. The extract is split into chunks of
    1500 characters (at most the first 6000 characters are used); the
    first chunk is stored with confidence 0.92, the rest with 0.85.

    The connection is closed even when the run aborts with an error.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    total_facts = 0
    total_pages = 0
    try:
        for cat in CATEGORIES:
            log.info(f"=== Kategorija: {cat} ===")
            pages = category_members(cat, limit=200)
            log.info(f" pages: {len(pages)}")
            for title in pages:
                try:
                    # Skip pages we already have. LIKE wildcards in the
                    # title are escaped so "%" / "_" match literally.
                    prefix = (title[:50].replace('\\', '\\\\')
                                        .replace('%', '\\%')
                                        .replace('_', '\\_'))
                    cur.execute("SELECT 1 FROM dabi.knowledge WHERE source = 'wiki_hr' AND fact LIKE %s LIMIT 1",
                                (f'{prefix}%',))
                    if cur.fetchone():
                        continue
                    extract, url = fetch_page_extract(title)
                    time.sleep(0.5)
                    if not extract or len(extract) < 200:
                        continue
                    # One chunk = one fact. The first chunk (intro) is the
                    # most important and gets the higher confidence.
                    chunks = [(title + "" + extract[:1500], 0.92)]
                    for i in range(1500, min(len(extract), 6000), 1500):
                        chunks.append((title + "" + extract[i:i+1500], 0.85))
                    for chunk_text, conf in chunks:
                        fh = hashlib.sha256(f"wiki:{title}:{chunk_text[:80]}".encode()).hexdigest()[:32]
                        refs = json.dumps([{"url": url, "title": title, "wikipedia": "hr"}])
                        try:
                            cur.execute("""
                                INSERT INTO dabi.knowledge
                                (fact, category, source, source_refs, confidence, data_hash, created_at)
                                VALUES (%s, %s, 'wiki_hr', %s::jsonb, %s, %s, now())
                                ON CONFLICT (data_hash) DO NOTHING
                            """, (chunk_text[:2000], f'wiki_{cat[:30]}', refs, conf, fh))
                            total_facts += cur.rowcount
                        except Exception as e:
                            # Insert errors are only reported while fewer
                            # than 5 facts have been stored.
                            if total_facts < 5:
                                log.warning(f"insert: {e}")
                    total_pages += 1
                    if total_pages % 20 == 0:
                        log.info(f" Progress: {total_pages} pages, {total_facts} facts")
                except Exception as e:
                    log.warning(f" Page fail '{title}': {e}")
                    continue
            log.info(f" Done {cat}: total facts={total_facts}")
        log.info(f"═══ FINAL: {total_pages} pages, {total_facts} facts ═══")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    harvest()
+110
View File
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""D.3 — Wiki + Wikidata logo enrichment for top klubovi."""
import psycopg2, requests, re, json, time, urllib.parse
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20
DELAY = 0.5
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
def query_wiki(name, lang="hr"):
    """Search Wikipedia for *name* and return the first sports-related hit.

    The top 3 search results of the *lang* Wikipedia are considered. A
    result is used only if its title contains the last word of *name*
    and its intro extract contains one of the sport marker substrings.

    Returns a dict with keys ``title``, ``logo`` (thumbnail or original
    image URL, may be ``None``), ``biografija`` (first 1500 chars of the
    intro), ``source_url`` and ``lang``; or ``None`` when *name* is
    blank, nothing suitable is found, or a request fails.
    """
    words = name.split()
    if not words:
        # Nothing to search for; also avoids IndexError on words[-1] below.
        return None
    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    try:
        r = s.get(api_url,
                  params={"action":"query","format":"json","list":"search",
                          "srsearch":name,"srlimit":3,"utf8":1}, timeout=TIMEOUT)
        sr = r.json().get("query",{}).get("search",[])
        if not sr:
            return None
        candidates = [x["title"] for x in sr]
    except Exception:
        return None
    # Candidate titles must contain the last word of the searched name
    key = words[-1].lower()
    # NOTE(review): plain substring test, so short markers such as "tim"
    # also match inside unrelated words (e.g. "time").
    sport_markers = ("klub","sport","liga","prvenstv","football","basketball","handball",
                     "water polo","volleyball","cycling","sailing","tim","club")
    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)
        try:
            r = s.get(api_url,
                      params={"action":"query","format":"json",
                              "prop":"extracts|pageimages|info",
                              "exintro":1,"explaintext":1,
                              "piprop":"original|thumbnail","pithumbsize":500,
                              "inprop":"url","titles":title,"utf8":1}, timeout=TIMEOUT)
            pages = r.json().get("query",{}).get("pages",{})
            for pid, p in pages.items():
                if pid == "-1":
                    continue  # missing page
                extract = p.get("extract","")
                if not extract:
                    continue
                # Sport context check
                el = extract.lower()
                if not any(k in el for k in sport_markers):
                    continue
                logo = (p.get("thumbnail",{}).get("source") or
                        p.get("original",{}).get("source"))
                page_url = p.get("fullurl") or f"https://{lang}.wikipedia.org/wiki/{urllib.parse.quote(title.replace(' ','_'))}"
                return {"title":title, "logo":logo, "biografija":extract[:1500], "source_url":page_url, "lang":lang}
        except Exception:
            # Best effort: a failed page lookup just moves on to the next title.
            continue
    return None
def enrich_klub(naziv):
    """Look up a club on Wikipedia, trying several spellings of its name.

    Search terms, in order: the full name; the name without a leading
    club-type abbreviation (HNK, NK, RK, ...), if that differs; and that
    shortened name followed by "Rijeka" when the full name does not
    mention Rijeka. Each term is tried on the Croatian and then the
    English Wikipedia. Returns the first truthy ``query_wiki`` result,
    or ``None`` when every attempt fails.
    """
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()
    search_terms = [naziv]
    if stripped != naziv:
        search_terms.append(stripped)
        if "Rijeka" not in naziv:
            search_terms.append(f"{stripped} Rijeka")
    # Lazy generators: lookups run one at a time and stop at the first hit.
    attempts = ((term, lang) for term in search_terms for lang in ["hr", "en"])
    hits = (query_wiki(term, lang) for term, lang in attempts)
    return next((hit for hit in hits if hit), None)
# === MAIN ===
# Script body: pick up to 50 active clubs that have trophies or awards but
# no logo, look each one up on Wikipedia and store logo / note / web link.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()

# Top clubs: most trophies + awards, missing logo
cr.execute("""
    WITH top_klubovi AS (
        SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
               (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
               (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
        FROM pgz_sport.klubovi k
        WHERE k.id != 4426 AND k.aktivan=true
    )
    SELECT id, naziv FROM top_klubovi
    WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
    ORDER BY trofeja DESC, nagrada DESC LIMIT 50
""")
todo = cr.fetchall()
print(f"Klubovi to enrich (logo): {len(todo)}")

success = 0
for kid, naziv in todo:
    print(f"{naziv}", end="", flush=True)
    r = enrich_klub(naziv)
    if not r:
        print(" MISS")
        continue

    # (SET expression, bound value) pairs; the logo is only set when found,
    # note and web link never overwrite existing values (COALESCE).
    has_logo = bool(r.get("logo"))
    assignments = [("logo_url = %s", r["logo"])] if has_logo else []
    assignments += [
        ("napomena = COALESCE(napomena, %s)", r["biografija"][:1000]),
        ("web = COALESCE(web, %s)", r["source_url"]),
    ]
    set_clause = ", ".join([expr for expr, _ in assignments] + ["source_synced_at = now()"])
    bind = tuple(value for _, value in assignments) + (kid,)
    try:
        cr.execute(f"UPDATE pgz_sport.klubovi SET {set_clause} WHERE id = %s", bind)
        success += 1
        flags = " +LOGO" if has_logo else ""
        print(f" ✓ [{r['lang']}] {r['title']}{flags}")
    except Exception as e:
        print(f" DBerr: {e}")

print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
conn.close()
+110
View File
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""D.3 — Wiki + Wikidata logo enrichment for top klubovi."""
import psycopg2, requests, re, json, time, urllib.parse
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20
DELAY = 0.5
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
def query_wiki(name, lang="hr"):
    """Search Wikipedia for *name* and return the first sports-related hit.

    The top 3 search results of the *lang* Wikipedia are considered. A
    result is used only if its title contains the last word of *name*
    and its intro extract contains one of the sport marker substrings.

    Returns a dict with keys ``title``, ``logo`` (thumbnail or original
    image URL, may be ``None``), ``biografija`` (first 1500 chars of the
    intro), ``source_url`` and ``lang``; or ``None`` when *name* is
    blank, nothing suitable is found, or a request fails.
    """
    words = name.split()
    if not words:
        # Nothing to search for; also avoids IndexError on words[-1] below.
        return None
    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    try:
        r = s.get(api_url,
                  params={"action":"query","format":"json","list":"search",
                          "srsearch":name,"srlimit":3,"utf8":1}, timeout=TIMEOUT)
        sr = r.json().get("query",{}).get("search",[])
        if not sr:
            return None
        candidates = [x["title"] for x in sr]
    except Exception:
        return None
    # Candidate titles must contain the last word of the searched name
    key = words[-1].lower()
    # NOTE(review): plain substring test, so short markers such as "tim"
    # also match inside unrelated words (e.g. "time").
    sport_markers = ("klub","sport","liga","prvenstv","football","basketball","handball",
                     "water polo","volleyball","cycling","sailing","tim","club")
    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)
        try:
            r = s.get(api_url,
                      params={"action":"query","format":"json",
                              "prop":"extracts|pageimages|info",
                              "exintro":1,"explaintext":1,
                              "piprop":"original|thumbnail","pithumbsize":500,
                              "inprop":"url","titles":title,"utf8":1}, timeout=TIMEOUT)
            pages = r.json().get("query",{}).get("pages",{})
            for pid, p in pages.items():
                if pid == "-1":
                    continue  # missing page
                extract = p.get("extract","")
                if not extract:
                    continue
                # Sport context check
                el = extract.lower()
                if not any(k in el for k in sport_markers):
                    continue
                logo = (p.get("thumbnail",{}).get("source") or
                        p.get("original",{}).get("source"))
                page_url = p.get("fullurl") or f"https://{lang}.wikipedia.org/wiki/{urllib.parse.quote(title.replace(' ','_'))}"
                return {"title":title, "logo":logo, "biografija":extract[:1500], "source_url":page_url, "lang":lang}
        except Exception:
            # Best effort: a failed page lookup just moves on to the next title.
            continue
    return None
def enrich_klub(naziv):
    """Look up a club on Wikipedia, trying several spellings of its name.

    Search terms, in order: the full name; the name without a leading
    club-type abbreviation (HNK, NK, RK, ...), if that differs; and that
    shortened name followed by "Rijeka" when the full name does not
    mention Rijeka. Each term is tried on the Croatian and then the
    English Wikipedia. Returns the first truthy ``query_wiki`` result,
    or ``None`` when every attempt fails.
    """
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()
    search_terms = [naziv]
    if stripped != naziv:
        search_terms.append(stripped)
        if "Rijeka" not in naziv:
            search_terms.append(f"{stripped} Rijeka")
    # Lazy generators: lookups run one at a time and stop at the first hit.
    attempts = ((term, lang) for term in search_terms for lang in ["hr", "en"])
    hits = (query_wiki(term, lang) for term, lang in attempts)
    return next((hit for hit in hits if hit), None)
# === MAIN ===
# Script body: pick up to 50 active clubs that have trophies or awards but
# no logo, look each one up on Wikipedia and store logo / note / web link.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cr = conn.cursor()

# Top clubs: most trophies + awards, missing logo
cr.execute("""
    WITH top_klubovi AS (
        SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
               (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
               (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
        FROM pgz_sport.klubovi k
        WHERE k.id != 4426 AND k.aktivan=true
    )
    SELECT id, naziv FROM top_klubovi
    WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
    ORDER BY trofeja DESC, nagrada DESC LIMIT 50
""")
todo = cr.fetchall()
print(f"Klubovi to enrich (logo): {len(todo)}")

success = 0
for kid, naziv in todo:
    print(f" → {naziv}", end="", flush=True)
    r = enrich_klub(naziv)
    if not r:
        print(" MISS")
        continue

    # (SET expression, bound value) pairs; the logo is only set when found,
    # note and web link never overwrite existing values (COALESCE).
    has_logo = bool(r.get("logo"))
    assignments = [("logo_url = %s", r["logo"])] if has_logo else []
    assignments += [
        ("napomena = COALESCE(napomena, %s)", r["biografija"][:1000]),
        ("web = COALESCE(web, %s)", r["source_url"]),
    ]
    set_clause = ", ".join([expr for expr, _ in assignments] + ["source_synced_at = now()"])
    bind = tuple(value for _, value in assignments) + (kid,)
    try:
        cr.execute(f"UPDATE pgz_sport.klubovi SET {set_clause} WHERE id = %s", bind)
        success += 1
        flags = " +LOGO" if has_logo else ""
        print(f" ✓ [{r['lang']}] {r['title']}{flags}")
    except Exception as e:
        print(f" DBerr: {e}")

print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
conn.close()
+132
View File
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
import psycopg2, requests, re, json, time, urllib.parse
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20
DELAY = 0.4
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
def get_wikidata_id(title, lang):
    """Return the Wikidata item id (Q-id) linked to a Wikipedia page.

    Queries the *lang* Wikipedia for the ``wikibase_item`` page property
    of *title*. Returns ``None`` when the page is missing, has no linked
    item, or the request / response parsing fails. Interrupts
    (``KeyboardInterrupt``, ``SystemExit``) are not swallowed.
    """
    try:
        r = s.get(f"https://{lang}.wikipedia.org/w/api.php",
                  params={"action":"query","format":"json","prop":"pageprops",
                          "ppprop":"wikibase_item","titles":title}, timeout=TIMEOUT)
        pages = r.json().get("query",{}).get("pages",{})
        for pid, p in pages.items():
            if pid == "-1":
                continue  # missing page
            qid = p.get("pageprops",{}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Best effort: any lookup failure is reported as "no Q-id".
        pass
    return None
def get_wikidata_entity(qid):
    """Fetch the Wikidata entity document for *qid*.

    Returns the entity dict found under ``entities[qid]`` in the
    ``Special:EntityData`` JSON, or ``None`` when it is absent or the
    request / JSON parsing fails. Interrupts are not swallowed.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json", timeout=TIMEOUT)
        return r.json().get("entities",{}).get(qid)
    except Exception:
        return None
def get_label(qid, lang="hr"):
    """Return the label of Wikidata item *qid*.

    The label in *lang* is preferred, then English, then Croatian.
    Returns ``None`` when the request or JSON parsing fails; a falsy
    value is also returned when none of those labels exists.
    Interrupts are not swallowed.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json", timeout=TIMEOUT)
        ent = r.json().get("entities",{}).get(qid,{})
        labels = ent.get("labels",{})
        return (labels.get(lang,{}).get("value") or
                labels.get("en",{}).get("value") or
                labels.get("hr",{}).get("value"))
    except Exception:
        return None
def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity dict.

    Returns a dict that may contain:

    * ``datum_rodenja`` — ``YYYY-MM-DD`` taken from the first usable
      P569 (date of birth) claim; accepted only when the year is between
      1901 and 2025 and month/day are in the 1–12 / 1–31 range, which
      also rejects year-precision values such as ``-00-00``;
    * ``mjesto_rodenja`` — label (max 100 chars) of the first P19 (place
      of birth) claim for which a label could be fetched.

    An empty dict is returned for a falsy *entity* or when nothing
    usable is found. Malformed claims are skipped; only the lookup errors
    a malformed claim can cause are caught, so Ctrl-C still works.
    """
    out = {}
    if not entity:
        return out
    claims = entity.get("claims",{})

    # Date of birth
    for c in claims.get("P569",[]):
        try:
            t = c["mainsnak"]["datavalue"]["value"]["time"]
            m = re.match(r"^[+-]?(\d{4})-(\d{2})-(\d{2})", t)
        except (KeyError, TypeError):
            continue  # "no value" / "unknown value" snaks carry no datavalue
        if not m:
            continue
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1900 < y < 2026 and 1 <= mo <= 12 and 1 <= d <= 31:
            out["datum_rodenja"] = f"{y}-{mo:02d}-{d:02d}"
            break

    # Place of birth
    for c in claims.get("P19",[]):
        try:
            qid = c["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        lbl = get_label(qid,"hr")
        time.sleep(DELAY)  # throttle successive Wikidata requests
        if lbl:
            out["mjesto_rodenja"] = lbl[:100]
            break
    return out
# === MAIN ===
# Script body: for members whose source_url points at a Wikipedia article
# and who lack birth date or birth place, resolve the article to its
# Wikidata item and fill in the missing fields.
conn = psycopg2.connect(**DB); conn.autocommit = True
cr = conn.cursor()
cr.execute("""
    SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
    FROM pgz_sport.clanovi
    WHERE source_url LIKE '%wikipedia.org/wiki/%'
      AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
""")
todo = cr.fetchall()
print(f"Pending: {len(todo)}")
success = 0
for cid, ime, prezime, source_url, dob, mjesto in todo:
    m = re.match(r"https://(\w+)\.wikipedia\.org/wiki/(.+)", source_url)
    if not m:
        continue
    lang = m.group(1)
    # Drop a trailing "#section" fragment: it is not part of the page title.
    raw = m.group(2).split("#", 1)[0]
    if not raw:
        continue
    title = urllib.parse.unquote(raw).replace("_"," ")
    qid = get_wikidata_id(title, lang)
    time.sleep(DELAY)
    if not qid:
        # Try the other language edition with the same title
        alt = "en" if lang == "hr" else "hr"
        qid = get_wikidata_id(title, alt)
        time.sleep(DELAY)
    if not qid:
        print(f"{ime} {prezime}: no Q-id"); continue
    entity = get_wikidata_entity(qid)
    time.sleep(DELAY)
    parsed = parse_birth(entity)
    if not parsed:
        print(f"{ime} {prezime} ({qid}): no birth data"); continue
    # Only fill columns that are currently empty
    sets, vals = [], []
    if parsed.get("datum_rodenja") and not dob:
        sets.append("datum_rodenja = %s"); vals.append(parsed["datum_rodenja"])
    if parsed.get("mjesto_rodenja") and not mjesto:
        sets.append("mjesto_rodenja = %s"); vals.append(parsed["mjesto_rodenja"])
    if sets:
        sets.append("source_synced_at = now()")
        vals.append(cid)
        try:
            cr.execute(f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
        except psycopg2.Error as e:
            # One bad row must not abort the whole run.
            print(f"{ime} {prezime} ({qid}): DBerr: {e}"); continue
        success += 1
    flags = []
    if parsed.get("datum_rodenja"): flags.append(f"DOB={parsed['datum_rodenja']}")
    if parsed.get("mjesto_rodenja"): flags.append(f"M={parsed['mjesto_rodenja']}")
    print(f"{ime} {prezime} ({qid}): {' '.join(flags)}")
print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")
# Re-stats
cr.execute("""SELECT
    count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
    count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
    count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
    FROM pgz_sport.clanovi""")
r = cr.fetchone()
print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
conn.close()
+132
View File
@@ -0,0 +1,132 @@
#!/usr/bin/env python3
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
import psycopg2, requests, re, json, time, urllib.parse
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
user="rinet", password="R1net2026!SecureDB#v7")
UA = "RiNET-Civic/1.0 (https://rinet.one)"
TIMEOUT = 20
DELAY = 0.4
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
def get_wikidata_id(title, lang):
    """Return the Wikidata item id (Q-id) linked to a Wikipedia page.

    Queries the *lang* Wikipedia for the ``wikibase_item`` page property
    of *title*. Returns ``None`` when the page is missing, has no linked
    item, or the request / response parsing fails. Interrupts
    (``KeyboardInterrupt``, ``SystemExit``) are not swallowed.
    """
    try:
        r = s.get(f"https://{lang}.wikipedia.org/w/api.php",
                  params={"action":"query","format":"json","prop":"pageprops",
                          "ppprop":"wikibase_item","titles":title}, timeout=TIMEOUT)
        pages = r.json().get("query",{}).get("pages",{})
        for pid, p in pages.items():
            if pid == "-1":
                continue  # missing page
            qid = p.get("pageprops",{}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Best effort: any lookup failure is reported as "no Q-id".
        pass
    return None
def get_wikidata_entity(qid):
    """Fetch the Wikidata entity document for *qid*.

    Returns the entity dict found under ``entities[qid]`` in the
    ``Special:EntityData`` JSON, or ``None`` when it is absent or the
    request / JSON parsing fails. Interrupts are not swallowed.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json", timeout=TIMEOUT)
        return r.json().get("entities",{}).get(qid)
    except Exception:
        return None
def get_label(qid, lang="hr"):
    """Return the label of Wikidata item *qid*.

    The label in *lang* is preferred, then English, then Croatian.
    Returns ``None`` when the request or JSON parsing fails; a falsy
    value is also returned when none of those labels exists.
    Interrupts are not swallowed.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json", timeout=TIMEOUT)
        ent = r.json().get("entities",{}).get(qid,{})
        labels = ent.get("labels",{})
        return (labels.get(lang,{}).get("value") or
                labels.get("en",{}).get("value") or
                labels.get("hr",{}).get("value"))
    except Exception:
        return None
def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity dict.

    Returns a dict that may contain:

    * ``datum_rodenja`` — ``YYYY-MM-DD`` taken from the first usable
      P569 (date of birth) claim; accepted only when the year is between
      1901 and 2025 and month/day are in the 1–12 / 1–31 range, which
      also rejects year-precision values such as ``-00-00``;
    * ``mjesto_rodenja`` — label (max 100 chars) of the first P19 (place
      of birth) claim for which a label could be fetched.

    An empty dict is returned for a falsy *entity* or when nothing
    usable is found. Malformed claims are skipped; only the lookup errors
    a malformed claim can cause are caught, so Ctrl-C still works.
    """
    out = {}
    if not entity:
        return out
    claims = entity.get("claims",{})

    # Date of birth
    for c in claims.get("P569",[]):
        try:
            t = c["mainsnak"]["datavalue"]["value"]["time"]
            m = re.match(r"^[+-]?(\d{4})-(\d{2})-(\d{2})", t)
        except (KeyError, TypeError):
            continue  # "no value" / "unknown value" snaks carry no datavalue
        if not m:
            continue
        y, mo, d = int(m.group(1)), int(m.group(2)), int(m.group(3))
        if 1900 < y < 2026 and 1 <= mo <= 12 and 1 <= d <= 31:
            out["datum_rodenja"] = f"{y}-{mo:02d}-{d:02d}"
            break

    # Place of birth
    for c in claims.get("P19",[]):
        try:
            qid = c["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        lbl = get_label(qid,"hr")
        time.sleep(DELAY)  # throttle successive Wikidata requests
        if lbl:
            out["mjesto_rodenja"] = lbl[:100]
            break
    return out
# === MAIN ===
# Script body: for members whose source_url points at a Wikipedia article
# and who lack birth date or birth place, resolve the article to its
# Wikidata item and fill in the missing fields.
conn = psycopg2.connect(**DB); conn.autocommit = True
cr = conn.cursor()
cr.execute("""
    SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
    FROM pgz_sport.clanovi
    WHERE source_url LIKE '%wikipedia.org/wiki/%'
      AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
""")
todo = cr.fetchall()
print(f"Pending: {len(todo)}")
success = 0
for cid, ime, prezime, source_url, dob, mjesto in todo:
    m = re.match(r"https://(\w+)\.wikipedia\.org/wiki/(.+)", source_url)
    if not m:
        continue
    lang = m.group(1)
    # Drop a trailing "#section" fragment: it is not part of the page title.
    raw = m.group(2).split("#", 1)[0]
    if not raw:
        continue
    title = urllib.parse.unquote(raw).replace("_"," ")
    qid = get_wikidata_id(title, lang)
    time.sleep(DELAY)
    if not qid:
        # Try the other language edition with the same title
        alt = "en" if lang == "hr" else "hr"
        qid = get_wikidata_id(title, alt)
        time.sleep(DELAY)
    if not qid:
        print(f" ✗ {ime} {prezime}: no Q-id"); continue
    entity = get_wikidata_entity(qid)
    time.sleep(DELAY)
    parsed = parse_birth(entity)
    if not parsed:
        print(f" ✗ {ime} {prezime} ({qid}): no birth data"); continue
    # Only fill columns that are currently empty
    sets, vals = [], []
    if parsed.get("datum_rodenja") and not dob:
        sets.append("datum_rodenja = %s"); vals.append(parsed["datum_rodenja"])
    if parsed.get("mjesto_rodenja") and not mjesto:
        sets.append("mjesto_rodenja = %s"); vals.append(parsed["mjesto_rodenja"])
    if sets:
        sets.append("source_synced_at = now()")
        vals.append(cid)
        try:
            cr.execute(f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
        except psycopg2.Error as e:
            # One bad row must not abort the whole run.
            print(f" ✗ {ime} {prezime} ({qid}): DBerr: {e}"); continue
        success += 1
    flags = []
    if parsed.get("datum_rodenja"): flags.append(f"DOB={parsed['datum_rodenja']}")
    if parsed.get("mjesto_rodenja"): flags.append(f"M={parsed['mjesto_rodenja']}")
    print(f" ✓ {ime} {prezime} ({qid}): {' '.join(flags)}")
print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")
# Re-stats
cr.execute("""SELECT
    count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
    count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
    count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
    FROM pgz_sport.clanovi""")
r = cr.fetchone()
print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
conn.close()