PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
Executable
+142
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.
|
||||
Faster + more reliable than search-based approaches.
|
||||
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print ``msg`` on its own line and flush stdout immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``UA`` string as User-Agent and
    the body is decoded as UTF-8 with undecodable bytes replaced. Any
    exception raised while building the request, fetching or decoding yields
    ``None``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST ``page/summary`` document for ``title``.

    Args:
        title: Article title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, for example ``"hr"`` or ``"en"``.

    Returns:
        The decoded JSON document, or ``None`` when the request fails or the
        response body is not valid JSON.
    """
    # safe="" makes quote() percent-encode "/" as well; with the default
    # safe="/" a title containing a slash would turn into an extra URL path
    # segment and address a different resource.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}")
    if not raw:
        return None
    try:
        return json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None
|
||||
|
||||
# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with known wiki entries.
# A title of None means no article title is recorded for that language; rows
# where both titles are None therefore never yield a Wikipedia summary.
# Each athlete must appear only once: a repeated row is fetched and counted twice.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Vaterpolo PGŽ
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Boćanje legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Atletika: Sara Kolak is already listed in the first group (duplicate row removed).
]
|
||||
|
||||
def _first_usable_summary(hr_title, en_title):
    """Return ``(summary, lang)`` for the first acceptable article, else ``(None, None)``."""
    for lang, title in (("hr", hr_title), ("en", en_title)):
        if not title:
            continue
        summary = wiki_summary(title, lang)
        # Only a regular article with text is accepted. A rejected document
        # (for example a non-"standard" page type) must never leak out of
        # this loop as the result.
        if summary and summary.get("type") in ("standard", None) and summary.get("extract"):
            return summary, lang
        time.sleep(0.2)
    return None, None


def main():
    """Enrich ``pgz_sport.clanovi`` biographies from the curated Wikipedia titles.

    For every row of ``CURATED`` the matching member is looked up by first and
    last name (case-insensitive). The Croatian summary is tried first, then
    the English one. On success the extract (at most 1500 characters) is
    stored in ``biografija`` and a fact row is inserted into
    ``dabi.knowledge``. A short report of the longest biographies is printed
    at the end. The connection runs in autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        enriched = 0
        tried = 0

        for full, hr_title, en_title in CURATED:
            tried += 1
            # "Ime Prezime ..." -> first token is the first name, the rest the surname.
            ime, _, prez = full.partition(" ")

            # Find clan record
            cr.execute("""SELECT id, sport, klub_id FROM pgz_sport.clanovi
                          WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                          LIMIT 1""", (ime, prez))
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row

            # Fetch wiki - try hr first then en
            summary, wlang = _first_usable_summary(hr_title, en_title)
            if summary is None:
                out(f" ✗ {full} - no wiki page")
                continue

            extract = summary["extract"].strip()[:1500]
            # None (not "") when the URL is missing, so COALESCE keeps source_url NULL.
            content_urls = summary.get("content_urls") or {}
            wurl = (content_urls.get("desktop") or {}).get("page") or None

            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s, source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, wurl, cid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                            f"wikipedia_{wlang}", 0.95, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except psycopg2.Error as e:
                out(f" ERR {full}: {e}")

            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")

        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
                      WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15""")
        out("\nTop bios:")
        for r_ime, r_prezime, r_sport, r_len in cr.fetchall():
            # NULL columns come back as None, which rejects a width format spec.
            out(f" {r_ime or '':18} {r_prezime or '':18} {r_sport or '?':15} {r_len} chars")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+142
@@ -0,0 +1,142 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D curated: hand-curated wiki titles + extracts for top PGŽ athletes.
|
||||
Faster + more reliable than search-based approaches.
|
||||
For each known athlete, hardcode wiki title (HR/EN) and pull summary directly.
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print ``msg`` on its own line and flush stdout immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    The request is sent with the module-level ``UA`` string as User-Agent and
    the body is decoded as UTF-8 with undecodable bytes replaced. Any
    exception raised while building the request, fetching or decoding yields
    ``None``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Fetch the Wikipedia REST ``page/summary`` document for ``title``.

    Args:
        title: Article title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, for example ``"hr"`` or ``"en"``.

    Returns:
        The decoded JSON document, or ``None`` when the request fails or the
        response body is not valid JSON.
    """
    # safe="" makes quote() percent-encode "/" as well; with the default
    # safe="/" a title containing a slash would turn into an extra URL path
    # segment and address a different resource.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    raw = http_get(f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}")
    if not raw:
        return None
    try:
        return json.loads(raw)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return None
|
||||
|
||||
# Curated list: (full_name, hr_wiki_title, en_wiki_title) - athletes with known wiki entries.
# A title of None means no article title is recorded for that language; rows
# where both titles are None therefore never yield a Wikipedia summary.
# Each athlete must appear only once: a repeated row is fetched and counted twice.
CURATED = [
    # Olympic medalists (PGŽ historical heroes)
    ("Sara Kolak", "Sara Kolak", "Sara Kolak"),
    ("Duje Draganja", "Duje Draganja", "Duje Draganja"),
    ("Mirza Džomba", "Mirza Džomba", "Mirza Džomba"),
    ("Luciano Sušanj", "Luciano Sušanj", "Luciano Sušanj"),
    ("Damir Skomina", "Damir Skomina", "Damir Skomina"),
    # 2025 stars
    ("Petar Klovar", "Petar Klovar", "Petar Klovar"),
    ("Vitomir Maričić", "Vitomir Maričić", "Vitomir Maričić"),
    ("Sandra Delija", None, "Sandra Delija"),
    ("Laura Štefanac", "Laura Štefanac", "Laura Štefanac"),
    ("Ivan Šarić", "Ivan Šarić (šahist)", "Ivan Šarić (chess player)"),
    ("Damir Kreilach", "Damir Kreilach", "Damir Kreilach"),
    # Football (HNK Rijeka stars)
    ("Niko Janković", None, None),
    ("Ante Majstorović", "Ante Majstorović", "Ante Majstorović"),
    ("Toni Fruk", "Toni Fruk", "Toni Fruk"),
    ("Stjepan Radeljić", "Stjepan Radeljić", "Stjepan Radeljić"),
    ("Niko Galešić", "Niko Galešić", "Niko Galešić"),
    ("Bruno Bogojević", "Bruno Bogojević", "Bruno Bogojević"),
    ("Duje Čop", "Duje Čop", "Duje Čop"),
    ("Luka Menalo", "Luka Menalo", "Luka Menalo"),
    ("Mile Škorić", "Mile Škorić", "Mile Škorić"),
    ("Stipe Perica", "Stipe Perica", "Stipe Perica"),
    ("Marijan Čabraja", "Marijan Čabraja", "Marijan Čabraja"),
    ("Cherno Saho", None, "Cherno Saho"),
    ("Bruno Goda", None, None),
    ("Marco Pašalić", None, "Marco Pašalić"),
    ("Amer Gojak", "Amer Gojak", "Amer Gojak"),
    # Coaches
    ("Radomir Đalović", "Radomir Đalović", "Radomir Đalović"),
    # Vaterpolo PGŽ
    ("Tin Brubnjak", "Tin Brubnjak", "Tin Brubnjak"),
    # Boćanje legends
    ("Karlo Šaban", None, None),
    ("Carrolina Ban", None, None),
    # Karate
    ("Ema Sgardelli", None, "Ema Sgardelli"),
    # Atletika: Sara Kolak is already listed in the first group (duplicate row removed).
]
|
||||
|
||||
def _first_usable_summary(hr_title, en_title):
    """Return ``(summary, lang)`` for the first acceptable article, else ``(None, None)``."""
    for lang, title in (("hr", hr_title), ("en", en_title)):
        if not title:
            continue
        summary = wiki_summary(title, lang)
        # Only a regular article with text is accepted. A rejected document
        # (for example a non-"standard" page type) must never leak out of
        # this loop as the result.
        if summary and summary.get("type") in ("standard", None) and summary.get("extract"):
            return summary, lang
        time.sleep(0.2)
    return None, None


def main():
    """Enrich ``pgz_sport.clanovi`` biographies from the curated Wikipedia titles.

    For every row of ``CURATED`` the matching member is looked up by first and
    last name (case-insensitive). The Croatian summary is tried first, then
    the English one. On success the extract (at most 1500 characters) is
    stored in ``biografija`` and a fact row is inserted into
    ``dabi.knowledge``. A short report of the longest biographies is printed
    at the end. The connection runs in autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        enriched = 0
        tried = 0

        for full, hr_title, en_title in CURATED:
            tried += 1
            # "Ime Prezime ..." -> first token is the first name, the rest the surname.
            ime, _, prez = full.partition(" ")

            # Find clan record
            cr.execute("""SELECT id, sport, klub_id FROM pgz_sport.clanovi
                          WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
                          LIMIT 1""", (ime, prez))
            row = cr.fetchone()
            if not row:
                out(f" - {full} not in clanovi")
                continue
            cid, sport, _klub_id = row

            # Fetch wiki - try hr first then en
            summary, wlang = _first_usable_summary(hr_title, en_title)
            if summary is None:
                out(f" ✗ {full} - no wiki page")
                continue

            extract = summary["extract"].strip()[:1500]
            # None (not "") when the URL is missing, so COALESCE keeps source_url NULL.
            content_urls = summary.get("content_urls") or {}
            wurl = (content_urls.get("desktop") or {}).get("page") or None

            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s, source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, wurl, cid))
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (f"{full} ({sport or '?'}) — {extract[:600]} (Wikipedia {wlang.upper()})"[:2000],
                            f"wikipedia_{wlang}", 0.95, "biografija_sportasa"))
                enriched += 1
                out(f" ✓ [{wlang}] {full} - {len(extract)} chars")
            except psycopg2.Error as e:
                out(f" ERR {full}: {e}")

            time.sleep(0.3)

        out(f"\n=== DONE: tried={tried} enriched={enriched} ===")

        # Summary
        cr.execute("""SELECT count(*) FROM pgz_sport.clanovi WHERE LENGTH(biografija) > 200""")
        total = cr.fetchone()[0]
        out(f"\nTotal sportaša s bio > 200 chars: {total}")

        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio FROM pgz_sport.clanovi
                      WHERE LENGTH(biografija) > 200 ORDER BY bio DESC LIMIT 15""")
        out("\nTop bios:")
        for r_ime, r_prezime, r_sport, r_len in cr.fetchall():
            # NULL columns come back as None, which rejects a width format spec.
            out(f" {r_ime or '':18} {r_prezime or '':18} {r_sport or '?':15} {r_len} chars")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+116
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D PGŽ news enrichment: scrape novilist.hr, glasistre.hr, sportske.jutarnji.hr
|
||||
search pages directly (not via DDG which is blocked).
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print ``msg`` on its own line and flush stdout immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=12):
    """Fetch ``url`` and return the response body as text.

    The request carries the module-level ``UA`` User-Agent and an
    ``Accept-Language: hr,en`` header. The body is decoded as UTF-8 with
    undecodable bytes replaced. Any exception raised while building the
    request, fetching or decoding yields ``None``.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def html_to_text(h):
    """Reduce an HTML document to plain text.

    Script and style elements are removed, every remaining tag becomes a
    space, character references are decoded, and runs of whitespace collapse
    to a single space.

    Args:
        h: HTML markup as a string.

    Returns:
        The stripped plain-text rendering of ``h``.
    """
    from html import unescape  # local import: the file's import block is left untouched

    h = re.sub(r'<script.*?</script>', '', h, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode entities only after the tags are gone, so a decoded "&lt;" is not
    # mistaken for markup. Decoding (instead of deleting "&name;") keeps
    # letters such as "&scaron;" -> "š" that names depend on; "&nbsp;" becomes
    # U+00A0, which the \s+ below folds into a normal space.
    h = unescape(h)
    return re.sub(r'\s+', ' ', h).strip()
|
||||
|
||||
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of ``text`` that mention the athlete's surname.

    Args:
        text: Plain text of a page.
        ime: First name. Not needed for matching, because any sentence that
            contains the full name also contains the surname.
        prez: Surname, matched case-insensitively as a substring.
        sport: Accepted for call compatibility; not used for filtering.

    Returns:
        Matching sentences longer than 60 characters, joined by single spaces
        and cut to 1300 characters. Collection stops once more than 700
        characters are gathered. Returns ``""`` when nothing matches or when
        the text or surname is empty.
    """
    # An empty surname is a substring of every sentence and would mark the
    # whole page as relevant; None would raise on .lower().
    needle = (prez or "").strip().lower()
    if not needle or not text:
        return ""

    picked = []
    joined_len = 0  # length of " ".join(picked), maintained incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 60 and needle in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 700:
                break
    return " ".join(picked)[:1300]
|
||||
|
||||
def search_novilist(query):
    """Query the novilist.hr site search and return article link candidates.

    Returns at most the first five absolute ``https://www.novilist.hr/...``
    links found in ``href`` attributes of the result page, in page order, or
    an empty list when the page could not be fetched.
    """
    page = http_get("https://www.novilist.hr/?s=" + urllib.parse.quote(query))
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    return links[:5]
|
||||
|
||||
def _find_novilist_bio(ime, prez, sport_kw):
    """Return ``(text, url)`` from the first usable Novi list article, else ``("", None)``."""
    skip_words = ("autor", "kategorija", "tag", "wp-content", "feed", "page=")
    urls = search_novilist(f"{ime} {prez} {sport_kw}")
    time.sleep(0.4)
    # Filter BEFORE truncating: otherwise skipped links (author/category/tag
    # pages) can use up all three candidate slots and no article is tried.
    candidates = [u for u in urls if not any(word in u for word in skip_words)]
    for u in candidates[:3]:
        page = http_get(u, timeout=10)
        if page:
            para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
            if para and len(para) >= 200:
                return para, u
        time.sleep(0.3)
    return "", None


def main():
    """Fill missing biographies in ``pgz_sport.clanovi`` from novilist.hr articles.

    Targets are up to 50 members with HOO category 1-3 or an award at level
    SP/EP/OI/SK whose biography is missing or shorter than 200 characters.
    For each target the Novi list search is queried, candidate articles are
    scraped, and the first excerpt of at least 200 characters is stored in
    ``biografija`` together with a fact row in ``dabi.knowledge``. The
    connection runs in autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 50""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            # Try Novi list (Riječki regional). Rows without a complete name
            # are not searched: "None" would end up in the query and the fact.
            bio_text, bio_url = "", None
            if ime and prez:
                bio_text, bio_url = _find_novilist_bio(ime, prez, sport_kw)

            if bio_text:
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                                "novilist", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except psycopg2.Error as e:
                    out(f" ERR {full}: {e}")

            # Reported for every tenth target, including ones that found nothing.
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+116
@@ -0,0 +1,116 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D PGŽ news enrichment: scrape novilist.hr, glasistre.hr, sportske.jutarnji.hr
|
||||
search pages directly (not via DDG which is blocked).
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (compatible; PGZBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print ``msg`` on its own line and flush stdout immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=12):
    """Fetch ``url`` and return the response body as text.

    The request carries the module-level ``UA`` User-Agent and an
    ``Accept-Language: hr,en`` header. The body is decoded as UTF-8 with
    undecodable bytes replaced. Any exception raised while building the
    request, fetching or decoding yields ``None``.
    """
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def html_to_text(h):
    """Reduce an HTML document to plain text.

    Script and style elements are removed, every remaining tag becomes a
    space, character references are decoded, and runs of whitespace collapse
    to a single space.

    Args:
        h: HTML markup as a string.

    Returns:
        The stripped plain-text rendering of ``h``.
    """
    from html import unescape  # local import: the file's import block is left untouched

    h = re.sub(r'<script.*?</script>', '', h, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode entities only after the tags are gone, so a decoded "&lt;" is not
    # mistaken for markup. Decoding (instead of deleting "&name;") keeps
    # letters such as "&scaron;" -> "š" that names depend on; "&nbsp;" becomes
    # U+00A0, which the \s+ below folds into a normal space.
    h = unescape(h)
    return re.sub(r'\s+', ' ', h).strip()
|
||||
|
||||
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of ``text`` that mention the athlete's surname.

    Args:
        text: Plain text of a page.
        ime: First name. Not needed for matching, because any sentence that
            contains the full name also contains the surname.
        prez: Surname, matched case-insensitively as a substring.
        sport: Accepted for call compatibility; not used for filtering.

    Returns:
        Matching sentences longer than 60 characters, joined by single spaces
        and cut to 1300 characters. Collection stops once more than 700
        characters are gathered. Returns ``""`` when nothing matches or when
        the text or surname is empty.
    """
    # An empty surname is a substring of every sentence and would mark the
    # whole page as relevant; None would raise on .lower().
    needle = (prez or "").strip().lower()
    if not needle or not text:
        return ""

    picked = []
    joined_len = 0  # length of " ".join(picked), maintained incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 60 and needle in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 700:
                break
    return " ".join(picked)[:1300]
|
||||
|
||||
def search_novilist(query):
    """Query the novilist.hr site search and return article link candidates.

    Returns at most the first five absolute ``https://www.novilist.hr/...``
    links found in ``href`` attributes of the result page, in page order, or
    an empty list when the page could not be fetched.
    """
    page = http_get("https://www.novilist.hr/?s=" + urllib.parse.quote(query))
    if not page:
        return []
    links = re.findall(r'href="(https://www\.novilist\.hr/[^"]+)"', page)
    return links[:5]
|
||||
|
||||
def _find_novilist_bio(ime, prez, sport_kw):
    """Return ``(text, url)`` from the first usable Novi list article, else ``("", None)``."""
    skip_words = ("autor", "kategorija", "tag", "wp-content", "feed", "page=")
    urls = search_novilist(f"{ime} {prez} {sport_kw}")
    time.sleep(0.4)
    # Filter BEFORE truncating: otherwise skipped links (author/category/tag
    # pages) can use up all three candidate slots and no article is tried.
    candidates = [u for u in urls if not any(word in u for word in skip_words)]
    for u in candidates[:3]:
        page = http_get(u, timeout=10)
        if page:
            para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
            if para and len(para) >= 200:
                return para, u
        time.sleep(0.3)
    return "", None


def main():
    """Fill missing biographies in ``pgz_sport.clanovi`` from novilist.hr articles.

    Targets are up to 50 members with HOO category 1-3 or an award at level
    SP/EP/OI/SK whose biography is missing or shorter than 200 characters.
    For each target the Novi list search is queried, candidate articles are
    scraped, and the first excerpt of at least 200 characters is stored in
    ``biografija`` together with a fact row in ``dabi.knowledge``. The
    connection runs in autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 50""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            # Try Novi list (Riječki regional). Rows without a complete name
            # are not searched: "None" would end up in the query and the fact.
            bio_text, bio_url = "", None
            if ime and prez:
                bio_text, bio_url = _find_novilist_bio(ime, prez, sport_kw)

            if bio_text:
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: novilist.hr)"[:2000],
                                "novilist", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except psycopg2.Error as e:
                    out(f" ERR {full}: {e}")

            # Reported for every tenth target, including ones that found nothing.
            if tried % 10 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.4)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+139
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
|
||||
Gather first 2-3 hits, scrape, extract relevant paragraphs that mention the athlete + sport.
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
||||
|
||||
def out(msg):
    """Write ``msg`` to stdout followed by a newline, flushing immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=12):
    """Fetch ``url`` and return the response body as text.

    The request carries the module-level ``UA`` User-Agent plus HTML
    ``Accept`` and ``hr,en`` ``Accept-Language`` headers. The body is decoded
    as UTF-8 with undecodable bytes replaced. Any exception raised while
    building the request, fetching or decoding yields ``None``.
    """
    try:
        request_headers = {
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "hr,en",
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def ddg_search(query, limit=3):
    """Search DuckDuckGo's HTML endpoint and return the top result links.

    Args:
        query: Search phrase.
        limit: Maximum number of results to return.

    Returns:
        A list of ``(url, title)`` tuples in page order. The list is empty
        when the request fails, nothing matches, or ``limit`` is not positive.
    """
    from html import unescape  # local import: the file's import block is left untouched

    page = http_get(f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}")
    if not page:
        return []

    results = []
    # Extract <a class="result__a" href="...">title</a>
    anchor_re = r'<a [^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)</a>'
    for m in re.finditer(anchor_re, page):
        if len(results) >= limit:
            break
        # Attribute values are HTML-escaped ("&amp;" between parameters).
        href = unescape(m.group(1))
        # Strip DDG redirect. parse_qs percent-decodes the "uddg" value
        # exactly once, so a target URL that carries its own "&"-separated
        # parameters is returned whole. Unquoting the entire href first
        # exposed those "&" and cut the target short.
        target = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("uddg")
        link = target[0] if target else href
        results.append((link, m.group(2).strip()))
    return results
|
||||
|
||||
def html_to_text(html):
    """Reduce an HTML document to plain text.

    Script and style elements are removed, every remaining tag becomes a
    space, character references are decoded, and runs of whitespace collapse
    to a single space. Leading and trailing spaces are not stripped.

    Args:
        html: HTML markup as a string.

    Returns:
        The plain-text rendering of ``html``.
    """
    # Binds only ``unescape``; the ``html`` parameter keeps referring to the markup.
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode entities only after the tags are gone, so a decoded "&lt;" is not
    # mistaken for markup. Decoding (instead of deleting "&#NNN;") keeps
    # letters such as "&#353;" -> "š" that names depend on; "&nbsp;" becomes
    # U+00A0, which the \s+ below folds into a normal space.
    h = unescape(h)
    return re.sub(r'\s+', ' ', h)
|
||||
|
||||
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of ``text`` that mention the athlete's surname.

    Args:
        text: Plain text of a page.
        ime: First name. Not needed for matching, because any sentence that
            contains the full name also contains the surname.
        prez: Surname, matched case-insensitively as a substring.
        sport: Accepted for call compatibility; not used for filtering.

    Returns:
        Matching sentences longer than 50 characters, joined by single spaces
        and cut to 1500 characters. Collection stops once more than 800
        characters are gathered. Returns ``""`` when nothing matches or when
        the text or surname is empty.
    """
    # An empty surname is a substring of every sentence and would mark the
    # whole page as relevant; None would raise on .lower().
    needle = (prez or "").strip().lower()
    if not needle or not text:
        return ""

    picked = []
    joined_len = 0  # length of " ".join(picked), maintained incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 50 and needle in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 800:
                break
    return " ".join(picked)[:1500]
|
||||
|
||||
def _is_social_link(link):
    """Return True when ``link`` is hosted on a social-media domain."""
    social_domains = ("facebook.com", "instagram.com", "twitter.com",
                      "youtube.com", "youtu.be", "x.com", "tiktok.com")
    try:
        host = (urllib.parse.urlsplit(link).hostname or "").lower()
    except ValueError:
        return False
    # Compare host names, not substrings of the whole URL: "x.com" is a
    # substring of "netflix.com", and "facebook" can appear in an article slug.
    return any(host == d or host.endswith("." + d) for d in social_domains)


def _find_news_bio(ime, prez, sport_kw):
    """Return ``(text, url)`` from the first usable search hit, else ``("", None)``."""
    results = ddg_search(f'"{ime} {prez}" {sport_kw} Rijeka', limit=3)
    for link, _title in results:
        if _is_social_link(link):
            continue
        page = http_get(link, timeout=10)
        if page:
            para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
            if para and len(para) >= 200:
                return para, link
        time.sleep(0.3)
    return "", None


def main():
    """Fill missing biographies in ``pgz_sport.clanovi`` from web search hits.

    Targets are up to 80 members with HOO category 1-3 or an award at level
    SP/EP/OI/SK whose biography is missing or shorter than 200 characters.
    For each target a DuckDuckGo query is run, the hits are scraped, and the
    first excerpt of at least 200 characters is stored in ``biografija``
    together with a fact row in ``dabi.knowledge``. The connection runs in
    autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            # Rows without a complete name are not searched: "None" would end
            # up in the query and in the stored fact.
            bio_text, bio_url = "", None
            if ime and prez:
                bio_text, bio_url = _find_news_bio(ime, prez, sport_kw)

            if bio_text:
                # Insert
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
                                "online_news", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except psycopg2.Error as e:
                    out(f" ERR {full}: {e}")

            # Reported for every fifteenth target, including ones that found nothing.
            if tried % 15 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.5)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+139
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D news enrichment: search via DuckDuckGo HTML (no API key) for biographical news mentions.
|
||||
Gather first 2-3 hits, scrape, extract relevant paragraphs that mention the athlete + sport.
|
||||
"""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
|
||||
|
||||
def out(msg):
    """Write ``msg`` to stdout followed by a newline, flushing immediately."""
    print(msg, end="\n", flush=True)
|
||||
|
||||
def http_get(url, timeout=12):
    """Fetch ``url`` and return the response body as text.

    The request carries the module-level ``UA`` User-Agent plus HTML
    ``Accept`` and ``hr,en`` ``Accept-Language`` headers. The body is decoded
    as UTF-8 with undecodable bytes replaced. Any exception raised while
    building the request, fetching or decoding yields ``None``.
    """
    try:
        request_headers = {
            "User-Agent": UA,
            "Accept": "text/html,application/xhtml+xml",
            "Accept-Language": "hr,en",
        }
        request = urllib.request.Request(url, headers=request_headers)
        with urllib.request.urlopen(request, timeout=timeout) as response:
            payload = response.read()
        text = payload.decode("utf-8", errors="replace")
    except Exception:
        # Deliberate best effort: callers treat None as "nothing fetched".
        text = None
    return text
|
||||
|
||||
def ddg_search(query, limit=3):
    """Search DuckDuckGo's HTML endpoint and return the top result links.

    Args:
        query: Search phrase.
        limit: Maximum number of results to return.

    Returns:
        A list of ``(url, title)`` tuples in page order. The list is empty
        when the request fails, nothing matches, or ``limit`` is not positive.
    """
    from html import unescape  # local import: the file's import block is left untouched

    page = http_get(f"https://html.duckduckgo.com/html/?q={urllib.parse.quote(query)}")
    if not page:
        return []

    results = []
    # Extract <a class="result__a" href="...">title</a>
    anchor_re = r'<a [^>]*class="result__a"[^>]*href="([^"]+)"[^>]*>([^<]+)</a>'
    for m in re.finditer(anchor_re, page):
        if len(results) >= limit:
            break
        # Attribute values are HTML-escaped ("&amp;" between parameters).
        href = unescape(m.group(1))
        # Strip DDG redirect. parse_qs percent-decodes the "uddg" value
        # exactly once, so a target URL that carries its own "&"-separated
        # parameters is returned whole. Unquoting the entire href first
        # exposed those "&" and cut the target short.
        target = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get("uddg")
        link = target[0] if target else href
        results.append((link, m.group(2).strip()))
    return results
|
||||
|
||||
def html_to_text(html):
    """Reduce an HTML document to plain text.

    Script and style elements are removed, every remaining tag becomes a
    space, character references are decoded, and runs of whitespace collapse
    to a single space. Leading and trailing spaces are not stripped.

    Args:
        html: HTML markup as a string.

    Returns:
        The plain-text rendering of ``html``.
    """
    # Binds only ``unescape``; the ``html`` parameter keeps referring to the markup.
    from html import unescape

    h = re.sub(r'<script.*?</script>', '', html, flags=re.S | re.I)
    h = re.sub(r'<style.*?</style>', '', h, flags=re.S | re.I)
    h = re.sub(r'<[^>]+>', ' ', h)
    # Decode entities only after the tags are gone, so a decoded "&lt;" is not
    # mistaken for markup. Decoding (instead of deleting "&#NNN;") keeps
    # letters such as "&#353;" -> "š" that names depend on; "&nbsp;" becomes
    # U+00A0, which the \s+ below folds into a normal space.
    h = unescape(h)
    return re.sub(r'\s+', ' ', h)
|
||||
|
||||
def relevant_paragraph(text, ime, prez, sport):
    """Collect the sentences of ``text`` that mention the athlete's surname.

    Args:
        text: Plain text of a page.
        ime: First name. Not needed for matching, because any sentence that
            contains the full name also contains the surname.
        prez: Surname, matched case-insensitively as a substring.
        sport: Accepted for call compatibility; not used for filtering.

    Returns:
        Matching sentences longer than 50 characters, joined by single spaces
        and cut to 1500 characters. Collection stops once more than 800
        characters are gathered. Returns ``""`` when nothing matches or when
        the text or surname is empty.
    """
    # An empty surname is a substring of every sentence and would mark the
    # whole page as relevant; None would raise on .lower().
    needle = (prez or "").strip().lower()
    if not needle or not text:
        return ""

    picked = []
    joined_len = 0  # length of " ".join(picked), maintained incrementally
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if len(sentence) > 50 and needle in sentence.lower():
            joined_len += len(sentence) + (1 if picked else 0)
            picked.append(sentence)
            if joined_len > 800:
                break
    return " ".join(picked)[:1500]
|
||||
|
||||
def _is_social_link(link):
    """Return True when ``link`` is hosted on a social-media domain."""
    social_domains = ("facebook.com", "instagram.com", "twitter.com",
                      "youtube.com", "youtu.be", "x.com", "tiktok.com")
    try:
        host = (urllib.parse.urlsplit(link).hostname or "").lower()
    except ValueError:
        return False
    # Compare host names, not substrings of the whole URL: "x.com" is a
    # substring of "netflix.com", and "facebook" can appear in an article slug.
    return any(host == d or host.endswith("." + d) for d in social_domains)


def _find_news_bio(ime, prez, sport_kw):
    """Return ``(text, url)`` from the first usable search hit, else ``("", None)``."""
    results = ddg_search(f'"{ime} {prez}" {sport_kw} Rijeka', limit=3)
    for link, _title in results:
        if _is_social_link(link):
            continue
        page = http_get(link, timeout=10)
        if page:
            para = relevant_paragraph(html_to_text(page), ime, prez, sport_kw)
            if para and len(para) >= 200:
                return para, link
        time.sleep(0.3)
    return "", None


def main():
    """Fill missing biographies in ``pgz_sport.clanovi`` from web search hits.

    Targets are up to 80 members with HOO category 1-3 or an award at level
    SP/EP/OI/SK whose biography is missing or shorter than 200 characters.
    For each target a DuckDuckGo query is run, the hits are scraped, and the
    first excerpt of at least 200 characters is stored in ``biografija``
    together with a fact row in ``dabi.knowledge``. The connection runs in
    autocommit mode and is always closed.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 80""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            sport_kw = sport or "sportaš"

            # Rows without a complete name are not searched: "None" would end
            # up in the query and in the stored fact.
            bio_text, bio_url = "", None
            if ime and prez:
                bio_text, bio_url = _find_news_bio(ime, prez, sport_kw)

            if bio_text:
                # Insert
                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""", (bio_text, bio_url, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {bio_text[:600]} (izvor: {bio_url})"[:2000],
                                "online_news", 0.85, "biografija_sportasa"))
                    enriched += 1
                    out(f" ✓ {full} - {len(bio_text)} chars from {bio_url[:80]}")
                except psycopg2.Error as e:
                    out(f" ERR {full}: {e}")

            # Reported for every fifteenth target, including ones that found nothing.
            if tried % 15 == 0:
                out(f" Progress: tried={tried} enriched={enriched}")
            time.sleep(0.5)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+172
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D: Wikipedia/online enrichment for top sportašs.
|
||||
For each athlete: fetch hr.wikipedia + en.wikipedia summary,
|
||||
extract bio + medalje + datum/mjesto rođenja, populate clanovi.biografija + dabi.knowledge facts.
|
||||
"""
|
||||
import re, json, time
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
# Connection keyword arguments passed to psycopg2.connect(**DB).
# NOTE(review): the database password is hard-coded here and committed with the
# source; consider loading it from the environment or a secrets store and
# rotating the exposed value.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
DELAY = 0.5
|
||||
|
||||
def http_get(url, timeout=15):
    """Fetch *url* and return the response body as text, or None on failure.

    The body is decoded as UTF-8 with undecodable bytes replaced. Every
    failure (HTTP error statuses such as 404, network errors, malformed
    URLs) yields None; nothing is raised to the caller.
    """
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: HTTPError is an Exception subclass, so HTTP
        # status errors end up here together with everything else.
        body = None
    return body
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Look up candidate Wikipedia pages for *query* via the opensearch API.

    Returns a list of ``(title, url)`` tuples (the request asks for at most
    three results), or an empty list when the request or parsing fails.
    """
    encoded_query = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={encoded_query}&limit=3&format=json"
    payload = http_get(endpoint)
    if not payload:
        return []
    try:
        parsed = json.loads(payload)
        # Element 1 of the parsed response is paired with element 3 as (title, url).
        titles, links = parsed[1], parsed[3]
        return [(page_title, page_url) for page_title, page_url in zip(titles, links)]
    except Exception:
        return []
|
||||
|
||||
# Substrings accepted when the page title equals the athlete's full name.
_DIRECT_KEYWORDS = (
    "sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu",
    "ju-jitsu", "sahist", "šahis", "biciklist", "plivač",
    "plivat", "boxer", "olimpij", "bonifac", "athlete", "compete",
    "sportaš", "swimmer", "diver", "sailor", "wrestler",
)

# Substrings accepted for opensearch candidates (checked together with the
# athlete's own sport, see _find_wiki_summary).
_SEARCH_KEYWORDS = (
    "sport", "igrač", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "atletič",
    "ju-jitsu", "šahist", "biciklist", "plivač", "olimpij", "athlete",
    "compete", "sportaš", "swimmer", "diver", "sailor", "wrestler",
)


def _summary_text(summary):
    """Return lower-cased description + extract of a summary (first 2000 chars)."""
    text = (summary.get("description") or "") + " " + (summary.get("extract") or "")
    return text[:2000].lower()


def _is_article(summary):
    """Return True for a dict summary of type "standard" that is not a disambiguation."""
    return (
        isinstance(summary, dict)
        and summary.get("type") == "standard"
        and not summary.get("disambiguation")
    )


def _page_url(summary):
    """Return the desktop page URL stored in a summary, or None."""
    urls = summary.get("content_urls") or {}
    return (urls.get("desktop") or {}).get("page")


def _find_wiki_summary(full, sport):
    """Find a sport-related Wikipedia summary for the athlete named *full*.

    For each language (hr first, then en) the exact title is tried, then the
    opensearch results. A search candidate is accepted only when its text
    contains a sport keyword AND the first five characters of *sport*.

    Returns:
        ``(summary, lang, url)`` for the first accepted page, or
        ``(None, None, None)`` when nothing suitable is found.
    """
    sport_prefix = sport.lower()[:5] if sport else None

    for lang in ("hr", "en"):
        # Direct title try
        direct = wiki_summary(full, lang)
        if _is_article(direct):
            text = _summary_text(direct)
            if any(kw in text for kw in _DIRECT_KEYWORDS):
                return direct, lang, _page_url(direct)
        time.sleep(DELAY)

        # Search fallback
        for title, _url in wiki_search(full, lang):
            lowered_title = title.lower()
            if "razdvojba" in lowered_title or "disambiguation" in lowered_title:
                continue
            candidate = wiki_summary(title, lang)
            # Same article-type check as the direct path, so disambiguation
            # summaries are rejected even when the title does not say so.
            if not _is_article(candidate):
                continue
            text = _summary_text(candidate)
            if (
                sport_prefix
                and sport_prefix in text
                and any(kw in text for kw in _SEARCH_KEYWORDS)
            ):
                return candidate, lang, _page_url(candidate)
            time.sleep(DELAY)

    return None, None, None


def main():
    """Enrich athlete biographies from Wikipedia summaries.

    Selects members with HOO category 1-3 or with an award at SP/EP/OI/SK
    level, skips those whose biography is already longer than 200 characters,
    and for the rest stores the Wikipedia extract in
    ``pgz_sport.clanovi.biografija`` and a fact row in ``dabi.knowledge``.
    Finally prints the 15 longest Wikipedia-sourced biographies.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()

        # Target list: HOO categories 1-3 plus members awarded at SP/EP/OI/SK level
        cr.execute("""
            SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, c.biografija, k.naziv
            FROM pgz_sport.clanovi c
            LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE c.kategorija_hoo IN (1, 2, 3)
               OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                           WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK'))
            ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
        """)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")

        enriched = 0
        fact_count = 0

        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"

            summary, wiki_lang, wiki_url = _find_wiki_summary(full, sport)
            if not summary:
                continue

            extract = (summary.get("extract") or "").strip()[:1500]
            if len(extract) < 80:
                continue

            # Update clanovi; source/source_url are only filled when still NULL.
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source = COALESCE(source, %s),
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, f"wikipedia_{wiki_lang}", wiki_url, tid))
                enriched += 1
            except psycopg2.Error as e:
                print(f"  ERR update {full}: {e}")
                continue

            # Insert as fact in dabi.knowledge
            fact = f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {wiki_lang.upper()})"
            try:
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (fact[:2000], f"wikipedia_{wiki_lang}", 0.9, "biografija_sportasa"))
                if cr.rowcount:
                    fact_count += 1
            except psycopg2.Error as e:
                # The biography itself is already stored; report and carry on.
                print(f"  ERR fact {full}: {e}")

            print(f"  ✓ {full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)

        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")

        # Sample bios
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
                      FROM pgz_sport.clanovi
                      WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
                        AND source LIKE 'wikipedia%'
                      ORDER BY bio_len DESC LIMIT 15""")
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f"  {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+172
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
D: Wikipedia/online enrichment for top athletes (sportaši).
|
||||
For each athlete: fetch hr.wikipedia + en.wikipedia summary,
|
||||
extract bio + medalje + datum/mjesto rođenja, populate clanovi.biografija + dabi.knowledge facts.
|
||||
"""
|
||||
import re, json, time
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
DELAY = 0.5
|
||||
|
||||
def http_get(url, timeout=15):
    """Fetch *url* and return the response body as text, or None on failure.

    The body is decoded as UTF-8 with undecodable bytes replaced. Every
    failure (HTTP error statuses such as 404, network errors, malformed
    URLs) yields None; nothing is raised to the caller.
    """
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: HTTPError is an Exception subclass, so HTTP
        # status errors end up here together with everything else.
        body = None
    return body
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Look up candidate Wikipedia pages for *query* via the opensearch API.

    Returns a list of ``(title, url)`` tuples (the request asks for at most
    three results), or an empty list when the request or parsing fails.
    """
    encoded_query = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={encoded_query}&limit=3&format=json"
    payload = http_get(endpoint)
    if not payload:
        return []
    try:
        parsed = json.loads(payload)
        # Element 1 of the parsed response is paired with element 3 as (title, url).
        titles, links = parsed[1], parsed[3]
        return [(page_title, page_url) for page_title, page_url in zip(titles, links)]
    except Exception:
        return []
|
||||
|
||||
# Substrings accepted when the page title equals the athlete's full name.
_DIRECT_KEYWORDS = (
    "sport", "igra", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu",
    "ju-jitsu", "sahist", "šahis", "biciklist", "plivač",
    "plivat", "boxer", "olimpij", "bonifac", "athlete", "compete",
    "sportaš", "swimmer", "diver", "sailor", "wrestler",
)

# Substrings accepted for opensearch candidates (checked together with the
# athlete's own sport, see _find_wiki_summary).
_SEARCH_KEYWORDS = (
    "sport", "igrač", "klub", "natjecat", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "atletič",
    "ju-jitsu", "šahist", "biciklist", "plivač", "olimpij", "athlete",
    "compete", "sportaš", "swimmer", "diver", "sailor", "wrestler",
)


def _summary_text(summary):
    """Return lower-cased description + extract of a summary (first 2000 chars)."""
    text = (summary.get("description") or "") + " " + (summary.get("extract") or "")
    return text[:2000].lower()


def _is_article(summary):
    """Return True for a dict summary of type "standard" that is not a disambiguation."""
    return (
        isinstance(summary, dict)
        and summary.get("type") == "standard"
        and not summary.get("disambiguation")
    )


def _page_url(summary):
    """Return the desktop page URL stored in a summary, or None."""
    urls = summary.get("content_urls") or {}
    return (urls.get("desktop") or {}).get("page")


def _find_wiki_summary(full, sport):
    """Find a sport-related Wikipedia summary for the athlete named *full*.

    For each language (hr first, then en) the exact title is tried, then the
    opensearch results. A search candidate is accepted only when its text
    contains a sport keyword AND the first five characters of *sport*.

    Returns:
        ``(summary, lang, url)`` for the first accepted page, or
        ``(None, None, None)`` when nothing suitable is found.
    """
    sport_prefix = sport.lower()[:5] if sport else None

    for lang in ("hr", "en"):
        # Direct title try
        direct = wiki_summary(full, lang)
        if _is_article(direct):
            text = _summary_text(direct)
            if any(kw in text for kw in _DIRECT_KEYWORDS):
                return direct, lang, _page_url(direct)
        time.sleep(DELAY)

        # Search fallback
        for title, _url in wiki_search(full, lang):
            lowered_title = title.lower()
            if "razdvojba" in lowered_title or "disambiguation" in lowered_title:
                continue
            candidate = wiki_summary(title, lang)
            # Same article-type check as the direct path, so disambiguation
            # summaries are rejected even when the title does not say so.
            if not _is_article(candidate):
                continue
            text = _summary_text(candidate)
            if (
                sport_prefix
                and sport_prefix in text
                and any(kw in text for kw in _SEARCH_KEYWORDS)
            ):
                return candidate, lang, _page_url(candidate)
            time.sleep(DELAY)

    return None, None, None


def main():
    """Enrich athlete biographies from Wikipedia summaries.

    Selects members with HOO category 1-3 or with an award at SP/EP/OI/SK
    level, skips those whose biography is already longer than 200 characters,
    and for the rest stores the Wikipedia extract in
    ``pgz_sport.clanovi.biografija`` and a fact row in ``dabi.knowledge``.
    Finally prints the 15 longest Wikipedia-sourced biographies.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()

        # Target list: HOO categories 1-3 plus members awarded at SP/EP/OI/SK level
        cr.execute("""
            SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, c.biografija, k.naziv
            FROM pgz_sport.clanovi c
            LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
            WHERE c.kategorija_hoo IN (1, 2, 3)
               OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                           WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK'))
            ORDER BY c.kategorija_hoo NULLS LAST, c.prezime, c.ime
        """)
        targets = cr.fetchall()
        print(f"Targets: {len(targets)}")

        enriched = 0
        fact_count = 0

        for tid, ime, prez, sport, _kat, bio, klub in targets:
            if bio and len(bio) > 200:
                continue  # already enriched
            full = f"{ime} {prez}"

            summary, wiki_lang, wiki_url = _find_wiki_summary(full, sport)
            if not summary:
                continue

            extract = (summary.get("extract") or "").strip()[:1500]
            if len(extract) < 80:
                continue

            # Update clanovi; source/source_url are only filled when still NULL.
            try:
                cr.execute("""UPDATE pgz_sport.clanovi
                              SET biografija = %s,
                                  source = COALESCE(source, %s),
                                  source_url = COALESCE(source_url, %s),
                                  source_synced_at = now()
                              WHERE id = %s""",
                           (extract, f"wikipedia_{wiki_lang}", wiki_url, tid))
                enriched += 1
            except psycopg2.Error as e:
                print(f"  ERR update {full}: {e}")
                continue

            # Insert as fact in dabi.knowledge
            fact = f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {wiki_lang.upper()})"
            try:
                cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                              VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                           (fact[:2000], f"wikipedia_{wiki_lang}", 0.9, "biografija_sportasa"))
                if cr.rowcount:
                    fact_count += 1
            except psycopg2.Error as e:
                # The biography itself is already stored; report and carry on.
                print(f"  ERR fact {full}: {e}")

            print(f"  ✓ {full} ({wiki_lang}) {len(extract)} chars")
            time.sleep(DELAY)

        print(f"\n=== DONE: {enriched} enriched, {fact_count} new facts ===")

        # Sample bios
        cr.execute("""SELECT ime, prezime, sport, LENGTH(biografija) AS bio_len
                      FROM pgz_sport.clanovi
                      WHERE biografija IS NOT NULL AND LENGTH(biografija) > 100
                        AND source LIKE 'wikipedia%'
                      ORDER BY bio_len DESC LIMIT 15""")
        print("\nTop bios:")
        for r in cr.fetchall():
            print(f"  {r[0]} {r[1]} ({r[2]}) - {r[3]} chars")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+94
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D v2: simpler & faster wiki enrichment with stdout flush."""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse, urllib.error
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print *msg* on stdout and flush so progress shows up immediately."""
    print(msg)
    sys.stdout.flush()
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8, or None on any error."""
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: any failure is reported to the caller as None.
        body = None
    return body
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Substrings a Wikipedia extract/description must contain to count as
# sport-related; built once instead of on every loop iteration.
_SPORT_KEYWORDS = (
    "sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "biciklist",
    "plivat", "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer",
    "diver", "skier", "sailor", "vesla", "ringa", "gimnast",
)


def main():
    """Fill short or missing athlete biographies from exact-title Wikipedia pages.

    Processes up to 100 members (HOO category 1-3 or awarded at SP/EP/OI/SK
    level) whose biography is NULL or shorter than 200 characters. For each,
    the page titled "<ime> <prezime>" is requested on hr then en Wikipedia;
    the first sport-related summary is written to ``pgz_sport.clanovi`` and
    recorded as a fact in ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 100""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"

            # Try direct title match HR + EN
            for lang in ("hr", "en"):
                s = wiki_summary(full, lang)
                if not isinstance(s, dict) or not s:
                    continue
                if s.get("type") not in ("standard", None):
                    continue
                extract = (s.get("extract") or "").strip()
                if not extract or len(extract) < 80:
                    continue
                # Quality check: must mention sport keyword
                t = (extract + " " + (s.get("description") or "")).lower()
                if not any(kw in t for kw in _SPORT_KEYWORDS):
                    continue

                # Tolerate a null "desktop" entry as well as a missing one.
                wurl = ((s.get("content_urls") or {}).get("desktop") or {}).get("page")
                if not wurl:
                    continue

                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source = COALESCE(NULLIF(source, ''), %s),
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""",
                               (extract[:1500], f"wikipedia_{lang}", wurl, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    enriched += 1
                    out(f"  ✓ [{lang}] {full} - {len(extract)} chars")
                except psycopg2.Error as e:
                    out(f"  ERR {full}: {e}")
                break  # found, no need to try other lang

            if tried % 20 == 0:
                out(f"  Progress: tried={tried} enriched={enriched}")
            time.sleep(0.3)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+94
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D v2: simpler & faster wiki enrichment with stdout flush."""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse, urllib.error
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
|
||||
def out(msg):
    """Print *msg* on stdout and flush so progress shows up immediately."""
    print(msg)
    sys.stdout.flush()
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8, or None on any error."""
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: any failure is reported to the caller as None.
        body = None
    return body
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Substrings a Wikipedia extract/description must contain to count as
# sport-related; built once instead of on every loop iteration.
_SPORT_KEYWORDS = (
    "sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil",
    "kuglač", "boćar", "skij", "karat", "kickbox", "wushu", "biciklist",
    "plivat", "plivač", "athlete", "compet", "wrestle", "swimmer", "boxer",
    "diver", "skier", "sailor", "vesla", "ringa", "gimnast",
)


def main():
    """Fill short or missing athlete biographies from exact-title Wikipedia pages.

    Processes up to 100 members (HOO category 1-3 or awarded at SP/EP/OI/SK
    level) whose biography is NULL or shorter than 200 characters. For each,
    the page titled "<ime> <prezime>" is requested on hr then en Wikipedia;
    the first sport-related summary is written to ``pgz_sport.clanovi`` and
    recorded as a fact in ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 100""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"

            # Try direct title match HR + EN
            for lang in ("hr", "en"):
                s = wiki_summary(full, lang)
                if not isinstance(s, dict) or not s:
                    continue
                if s.get("type") not in ("standard", None):
                    continue
                extract = (s.get("extract") or "").strip()
                if not extract or len(extract) < 80:
                    continue
                # Quality check: must mention sport keyword
                t = (extract + " " + (s.get("description") or "")).lower()
                if not any(kw in t for kw in _SPORT_KEYWORDS):
                    continue

                # Tolerate a null "desktop" entry as well as a missing one.
                wurl = ((s.get("content_urls") or {}).get("desktop") or {}).get("page")
                if not wurl:
                    continue

                try:
                    cr.execute("""UPDATE pgz_sport.clanovi
                                  SET biografija = %s,
                                      source = COALESCE(NULLIF(source, ''), %s),
                                      source_url = COALESCE(source_url, %s),
                                      source_synced_at = now()
                                  WHERE id = %s""",
                               (extract[:1500], f"wikipedia_{lang}", wurl, tid))
                    cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                  VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                               (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    enriched += 1
                    out(f"  ✓ [{lang}] {full} - {len(extract)} chars")
                except psycopg2.Error as e:
                    out(f"  ERR {full}: {e}")
                break  # found, no need to try other lang

            if tried % 20 == 0:
                out(f"  Progress: tried={tried} enriched={enriched}")
            time.sleep(0.3)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+112
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D v3: enrichment with Wikipedia search API + DuckDuckGo as fallback."""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
|
||||
def out(msg):
    """Write *msg* plus a newline to stdout and flush immediately."""
    sys.stdout.write(str(msg) + "\n")
    sys.stdout.flush()
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8, or None on any error."""
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: any failure is reported to the caller as None.
        body = None
    return body
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Query the Wikipedia opensearch API for *query*.

    Returns a list of ``(title, url)`` tuples (the request asks for at most
    five results), or an empty list when the request or parsing fails.
    """
    encoded_query = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={encoded_query}&limit=5&format=json"
    payload = http_get(endpoint)
    if not payload:
        return []
    try:
        parsed = json.loads(payload)
        # Element 1 of the parsed response is paired with element 3 as (title, url).
        titles, links = parsed[1], parsed[3]
        return [(page_title, page_url) for page_title, page_url in zip(titles, links)]
    except Exception:
        return []
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Lower-case substrings (Croatian and English stems); main() accepts a
# Wikipedia summary only if its extract/description contains at least one.
SPORT_KW = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil", "kuglač",
            "boćar", "skij", "karat", "kickbox", "wushu", "biciklist", "plivat", "plivač",
            "athlete", "compet", "wrestle", "swimmer", "boxer", "diver", "skier", "sailor",
            "vesla", "gimnast", "rukomet", "košark", "tenisa", "šahist", "ribolov",
            "ju-jitsu", "jujits", "automob", "rally", "racing", "freediv", "apnea", "free diving")
|
||||
|
||||
def main():
    """Fill short or missing athlete biographies using Wikipedia search results.

    Processes up to 150 members (HOO category 1-3 or awarded at SP/EP/OI/SK
    level) whose biography is NULL or shorter than 200 characters. For each,
    the first three opensearch hits on hr then en Wikipedia are inspected; a
    hit is accepted when its summary mentions the member's surname and a
    keyword from ``SPORT_KW``. The extract is written to
    ``pgz_sport.clanovi`` and recorded as a fact in ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 150""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # A NULL surname cannot be matched; it makes every candidate fail below.
            surname = (prez or "").lower()

            found = False
            for lang in ("hr", "en"):
                # Try search
                for title, _link in wiki_search(full, lang)[:3]:
                    lowered_title = title.lower()
                    if "razdvojba" in lowered_title or "disambiguation" in lowered_title:
                        continue
                    s = wiki_summary(title, lang)
                    if not isinstance(s, dict) or not s:
                        continue
                    if s.get("type") not in ("standard", None):
                        continue
                    extract = (s.get("extract") or "").strip()
                    if not extract or len(extract) < 80:
                        continue
                    # Match: must contain at least sport keyword AND surname
                    tlower = (extract + " " + (s.get("description") or "")).lower()
                    if not surname or surname not in tlower:
                        continue  # not about same person
                    if not any(kw in tlower for kw in SPORT_KW):
                        continue

                    # Tolerate a null "desktop" entry as well as a missing one.
                    wurl = ((s.get("content_urls") or {}).get("desktop") or {}).get("page")
                    if not wurl:
                        continue

                    try:
                        cr.execute("""UPDATE pgz_sport.clanovi
                                      SET biografija = %s,
                                          source_url = COALESCE(source_url, %s),
                                          source_synced_at = now()
                                      WHERE id = %s""",
                                   (extract[:1500], wurl, tid))
                        cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                      VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                                   (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                    f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    except psycopg2.Error as e:
                        out(f"  ERR {full}: {e}")
                    else:
                        enriched += 1
                        out(f"  ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
                        found = True
                        break
                time.sleep(0.2)
                if found:
                    break

            if tried % 25 == 0:
                out(f"  Progress: tried={tried} enriched={enriched}")
            time.sleep(0.2)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+112
@@ -0,0 +1,112 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D v3: enrichment with Wikipedia search API + DuckDuckGo as fallback."""
|
||||
import re, json, time, sys
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host="localhost", port=5432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "Mozilla/5.0 (PGZSportBot/1.0)"
|
||||
|
||||
def out(msg):
    """Write *msg* plus a newline to stdout and flush immediately."""
    sys.stdout.write(str(msg) + "\n")
    sys.stdout.flush()
|
||||
|
||||
def http_get(url, timeout=10):
    """Fetch *url* and return the body decoded as UTF-8, or None on any error."""
    body = None
    try:
        request = urllib.request.Request(
            url,
            headers={"User-Agent": UA, "Accept-Language": "hr,en"},
        )
        with urllib.request.urlopen(request, timeout=timeout) as response:
            body = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Best-effort fetch: any failure is reported to the caller as None.
        body = None
    return body
|
||||
|
||||
def wiki_search(query, lang="hr"):
    """Query the Wikipedia opensearch API for *query*.

    Returns a list of ``(title, url)`` tuples (the request asks for at most
    five results), or an empty list when the request or parsing fails.
    """
    encoded_query = urllib.parse.quote(query)
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&search={encoded_query}&limit=5&format=json"
    payload = http_get(endpoint)
    if not payload:
        return []
    try:
        parsed = json.loads(payload)
        # Element 1 of the parsed response is paired with element 3 as (title, url).
        titles, links = parsed[1], parsed[3]
        return [(page_title, page_url) for page_title, page_url in zip(titles, links)]
    except Exception:
        return []
|
||||
|
||||
def wiki_summary(title, lang="hr"):
    """Return the Wikipedia REST ``page/summary`` payload for *title*, or None.

    Args:
        title: Page title; spaces are converted to underscores.
        lang: Wikipedia language subdomain, e.g. "hr" or "en".

    Returns:
        The decoded JSON object (a dict), or None when the request fails,
        the body is not valid JSON, or the JSON is not an object.
    """
    # safe="" so that a "/" inside a title is sent as %2F instead of being
    # interpreted as an additional path segment of the REST URL.
    enc = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = f"https://{lang}.wikipedia.org/api/rest_v1/page/summary/{enc}"
    raw = http_get(url)
    if not raw:
        return None
    try:
        data = json.loads(raw)
    except ValueError:
        return None
    # Callers call .get() on the result, so anything but a dict is unusable.
    return data if isinstance(data, dict) else None
|
||||
|
||||
# Lower-case substrings (Croatian and English stems); main() accepts a
# Wikipedia summary only if its extract/description contains at least one.
SPORT_KW = ("sport", "klub", "olimp", "natjec", "atlet", "vater", "ronil", "kuglač",
            "boćar", "skij", "karat", "kickbox", "wushu", "biciklist", "plivat", "plivač",
            "athlete", "compet", "wrestle", "swimmer", "boxer", "diver", "skier", "sailor",
            "vesla", "gimnast", "rukomet", "košark", "tenisa", "šahist", "ribolov",
            "ju-jitsu", "jujits", "automob", "rally", "racing", "freediv", "apnea", "free diving")
|
||||
|
||||
def main():
    """Fill short or missing athlete biographies using Wikipedia search results.

    Processes up to 150 members (HOO category 1-3 or awarded at SP/EP/OI/SK
    level) whose biography is NULL or shorter than 200 characters. For each,
    the first three opensearch hits on hr then en Wikipedia are inspected; a
    hit is accepted when its summary mentions the member's surname and a
    keyword from ``SPORT_KW``. The extract is written to
    ``pgz_sport.clanovi`` and recorded as a fact in ``dabi.knowledge``.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cr = conn.cursor()
        cr.execute("""SELECT DISTINCT c.id, c.ime, c.prezime, c.sport, c.kategorija_hoo, k.naziv
                      FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id = c.klub_id
                      WHERE (c.kategorija_hoo IN (1, 2, 3)
                             OR c.id IN (SELECT DISTINCT clan_id FROM pgz_sport.clan_nagrada
                                         WHERE clan_id IS NOT NULL AND razina_natjecanja IN ('SP','EP','OI','SK')))
                        AND (c.biografija IS NULL OR LENGTH(c.biografija) < 200)
                      ORDER BY c.kategorija_hoo NULLS LAST LIMIT 150""")
        targets = cr.fetchall()
        out(f"Targets: {len(targets)}")

        enriched = 0
        tried = 0

        for tid, ime, prez, sport, _kat, klub in targets:
            tried += 1
            full = f"{ime} {prez}"
            # A NULL surname cannot be matched; it makes every candidate fail below.
            surname = (prez or "").lower()

            found = False
            for lang in ("hr", "en"):
                # Try search
                for title, _link in wiki_search(full, lang)[:3]:
                    lowered_title = title.lower()
                    if "razdvojba" in lowered_title or "disambiguation" in lowered_title:
                        continue
                    s = wiki_summary(title, lang)
                    if not isinstance(s, dict) or not s:
                        continue
                    if s.get("type") not in ("standard", None):
                        continue
                    extract = (s.get("extract") or "").strip()
                    if not extract or len(extract) < 80:
                        continue
                    # Match: must contain at least sport keyword AND surname
                    tlower = (extract + " " + (s.get("description") or "")).lower()
                    if not surname or surname not in tlower:
                        continue  # not about same person
                    if not any(kw in tlower for kw in SPORT_KW):
                        continue

                    # Tolerate a null "desktop" entry as well as a missing one.
                    wurl = ((s.get("content_urls") or {}).get("desktop") or {}).get("page")
                    if not wurl:
                        continue

                    try:
                        cr.execute("""UPDATE pgz_sport.clanovi
                                      SET biografija = %s,
                                          source_url = COALESCE(source_url, %s),
                                          source_synced_at = now()
                                      WHERE id = %s""",
                                   (extract[:1500], wurl, tid))
                        cr.execute("""INSERT INTO dabi.knowledge (fact, source, confidence, category, created_at)
                                      VALUES (%s, %s, %s, %s, now()) ON CONFLICT DO NOTHING""",
                                   (f"{full} ({sport or '?'}, {klub or 'PGŽ'}) — {extract[:600]} (Wikipedia {lang.upper()})"[:2000],
                                    f"wikipedia_{lang}", 0.9, "biografija_sportasa"))
                    except psycopg2.Error as e:
                        out(f"  ERR {full}: {e}")
                    else:
                        enriched += 1
                        out(f"  ✓ [{lang}] {full} matched: {title} - {len(extract)} chars")
                        found = True
                        break
                time.sleep(0.2)
                if found:
                    break

            if tried % 25 == 0:
                out(f"  Progress: tried={tried} enriched={enriched}")
            time.sleep(0.2)

        out(f"=== DONE: tried={tried} enriched={enriched} ===")
    finally:
        # Release the connection even when a query or lookup raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
# clan_oib_enricher.py — match clanovi s civic.persons po imenu+prezimenu+grad
|
||||
import os, sys, logging, re
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [clan_oib] %(message)s')
|
||||
log = logging.getLogger("clan_oib")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
def _like_escape(value):
    """Escape LIKE wildcards (% and _) and the backslash escape char in *value*."""
    return value.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def main():
    """Copy OIBs from civic.persons onto members that lack a valid one.

    Up to 5000 members without an 11-character OIB (and with first and last
    name longer than two characters) are looked up in ``civic.persons`` by a
    case-insensitive ``%ime%prezime%`` name pattern. The OIB is written only
    when exactly one person matches.

    NOTE(review): the pattern is a substring match in "ime ... prezime"
    order, and grad/datum_rodenja are selected but not used to disambiguate.
    Confirm this is strict enough for assigning a personal identifier.
    """
    conn = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    conn.autocommit = True
    try:
        cur = conn.cursor()

        cur.execute("""
            SELECT id, ime, prezime, grad, datum_rodenja
            FROM pgz_sport.clanovi
            WHERE (oib IS NULL OR length(oib) != 11)
              AND ime IS NOT NULL AND prezime IS NOT NULL
              AND length(ime) > 2 AND length(prezime) > 2
            ORDER BY id
            LIMIT 5000
        """)
        clanovi = cur.fetchall()
        log.info(f"Members for OIB enrichment: {len(clanovi)}")

        lookup_sql = """
            SELECT oib, name
            FROM civic.persons
            WHERE oib IS NOT NULL AND length(oib) = 11
              AND lower(name) LIKE %s
            LIMIT 5
        """

        enriched = 0
        for c in clanovi:
            # The column is compared as lower(name), so the pattern must be
            # lower-cased too; otherwise capitalised names never match.
            # Wildcards inside the names are escaped so they match literally.
            pattern = "%{}%{}%".format(
                _like_escape(c["ime"].lower()),
                _like_escape(c["prezime"].lower()),
            )
            try:
                cur.execute(lookup_sql, (pattern,))
                matches = cur.fetchall()
            except psycopg2.Error as e:
                log.warning(f"Lookup failed for member {c['id']}: {e}")
                continue

            if len(matches) != 1:
                continue  # no match or ambiguous: leave the member untouched

            # Unique match: update, re-checking that the OIB is still missing.
            cur.execute("UPDATE pgz_sport.clanovi SET oib = %s WHERE id = %s AND (oib IS NULL OR length(oib) != 11)",
                        (matches[0]['oib'], c['id']))
            if cur.rowcount:
                enriched += 1
                if enriched % 100 == 0:
                    log.info(f"Progress: {enriched} enriched")

        log.info(f"FINAL: {enriched} clanovi enriched (unique matches only)")
        cur.close()
    finally:
        # Release the connection even when a query raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
# Seed cultural_qa training data from dabi.knowledge (Alan Ford, Čakavian, šatrovački slang, sayings)
|
||||
import psycopg2, hashlib, logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa] %(message)s')
|
||||
log = logging.getLogger("cult_qa")
|
||||
# libpq connection string used by main().
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading the DSN from the environment or a secrets store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
def _leading_term(fact):
    """Return the first word of *fact* without trailing punctuation, or ''."""
    words = fact.split()
    if not words or len(words[0]) <= 2:
        return ''
    return words[0].rstrip('=,:;.').strip()


def _questions_for(fact, cat):
    """Build the question variants for one knowledge fact based on its category."""
    fact_lower = fact.lower()

    if 'alan_ford' in cat:
        # Use the first known character mentioned in the fact, if any.
        for char in ['Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
                     'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga']:
            if char.lower() in fact_lower:
                return [f"Tko je {char}?", f"Što znaš o liku {char}?"]
        return ["Što znaš o Alan Fordu?"]

    if 'satrovacki' in cat:
        # The first word of the fact is treated as the term being defined.
        term = _leading_term(fact)
        if term:
            return [f"Što znači {term}?", f"Što je {term} na šatrovačkom?"]
        return []

    if 'cakavski' in cat or 'fjumanski' in cat:
        term = _leading_term(fact)
        if term:
            return [f"Što znači {term}?", f"Što je {term} u riječkom dijalektu?"]
        return []

    if 'rijeka_izreka' in cat:
        return ["Reci mi neku riječku izreku.", "Koje su tradicionalne riječke izreke?"]

    # Other selected categories get no generated question.
    return []


def main():
    """Seed ``dabi.training_qa`` with question/answer pairs built from cultural facts.

    Reads up to 1000 facts from ``dabi.knowledge`` in the listed categories,
    generates at most two questions per fact and inserts each one with the
    fact text (truncated to 500 characters) as the answer. Insert failures
    are counted; the first three are logged in full and a total is logged at
    the end.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    inserted = 0
    failed = 0
    try:
        with conn.cursor() as cur:
            # Get cultural facts
            cur.execute("""
                SELECT id, fact, category FROM dabi.knowledge
                WHERE category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                                   'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                                   'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                                   'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik')
                  AND fact IS NOT NULL AND length(fact) > 30
                LIMIT 1000
            """)
            rows = cur.fetchall()
            log.info(f"Cultural facts: {len(rows)}")

            for _fid, fact, cat in rows:
                for q in _questions_for(fact, cat)[:2]:  # max 2 per fact
                    try:
                        cur.execute("""
                            INSERT INTO dabi.training_qa
                                (question, answer, category, source_type, created_at)
                            VALUES (%s, %s, %s, 'cultural_seed', now())
                            ON CONFLICT DO NOTHING
                        """, (q[:300], fact[:500], 'cultural_' + cat.split('_')[0]))
                        inserted += cur.rowcount
                    except psycopg2.Error as e:
                        failed += 1
                        # Cap the per-error noise by the number of failures,
                        # not by the number of successful inserts.
                        if failed <= 3:
                            log.warning(f"insert err: {e}")
    finally:
        conn.close()

    if failed:
        log.warning(f"Failed inserts: {failed}")
    log.info(f"Inserted: {inserted} cultural Q&A pairs")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
# Extended cultural Q&A seed (each fact yields up to 3 question variants)
|
||||
import psycopg2, hashlib, logging, re
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa2] %(message)s')
|
||||
# libpq connection string used by main().
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading the DSN from the environment or a secrets store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
# Alan Ford characters looked for (case-insensitively) in each fact.
_CHARACTERS = ('Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija', 'Šef',
               'Margot', 'Superhik', 'Notar', 'Cifra Sluga', 'Don Galon', 'Debela Gilda')
# Category fragments for which the leading word of the fact is treated as a term.
_TERM_CATEGORY_KEYS = ('satrovacki', 'cakavski', 'fjumanski', 'lokalni')
# Leading words that are never used as the term.
_TERM_STOPWORDS = frozenset(['ova', 'taj', 'jest', 'znači', 'što', 'kako', 'tko', 'gdje'])
# Fallback question per category fragment, checked in this order.
_FALLBACK_QUESTIONS = {
    'alan_ford': 'Pričaj mi nešto o Alan Fordu.',
    'cakavski': 'Pričaj mi o čakavskom dijalektu.',
    'satrovacki': 'Što je šatrovački?',
    'fjumanski': 'Što je fjumanski?',
    'rijeka': 'Što je posebno za Rijeku?',
}


def _question_variants(fact, cat):
    """Return every question variant generated for one fact, in priority order."""
    questions = []
    fl = fact.lower()
    cat_l = cat.lower()

    # Alan Ford characters: only the first one mentioned is used.
    for ch in _CHARACTERS:
        if ch.lower() in fl:
            questions.extend([f"Tko je {ch}?", f"Što znaš o {ch}?", f"Kakav je lik {ch}?"])
            break

    # Dialect/slang categories: first usable word among the first three.
    if any(k in cat_l for k in _TERM_CATEGORY_KEYS):
        for w in re.findall(r'\b\w+\b', fact)[:3]:
            if len(w) >= 3 and w.lower() not in _TERM_STOPWORDS:
                questions.append(f"Što znači riječ {w}?")
                questions.append(f"Što je {w}?")
                break

    # Rijeka sayings
    if 'izrek' in cat_l or 'izrek' in fl:
        questions.append("Reci mi neku riječku izreku.")
        questions.append("Imaš li primjer riječke poslovice?")

    # General Rijeka context
    if 'rijeka' in fl or 'kvarner' in fl or 'trsat' in fl or 'preluk' in fl:
        questions.append("Što mi možeš reći o Rijeci?")

    if not questions:
        # Fallback question based on the category
        for k, v in _FALLBACK_QUESTIONS.items():
            if k in cat_l:
                questions.append(v)
                break

    return questions


def main():
    """Seed ``dabi.training_qa`` with up to three question variants per cultural fact.

    Selects up to 2000 facts from ``dabi.knowledge`` either by category or by
    a keyword regex on the fact text, generates questions for each and inserts
    them with the fact (truncated to 800 characters) as the answer. Insert
    failures are counted and logged instead of being silently dropped.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    inserted = 0
    failed = 0
    try:
        with conn.cursor() as cur:
            # Local Rijeka + dialect facts
            cur.execute("""
                SELECT id, fact, category FROM dabi.knowledge
                WHERE (category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                                    'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                                    'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                                    'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik',
                                    'pgz_administracija','pgz_promet','rijeka_lokali','rijeka_lokal')
                       OR fact ~ '\\m(žišku|brodo|rista|vopi|kantun|ponistra|šugaman)\\M'
                       OR fact ~ '\\m(Alan Ford|Bob Rock|Sir Oliver|TNT|Grunf)\\M')
                  AND fact IS NOT NULL AND length(fact) > 30 AND length(fact) < 1500
                LIMIT 2000
            """)
            rows = cur.fetchall()
            logging.info(f"Cultural facts proširen: {len(rows)}")

            for _fid, fact, cat in rows:
                # Rows matched only through the fact regex are not filtered by
                # category, so a NULL category is tolerated here.
                cat = cat or ''
                for q in _question_variants(fact, cat)[:3]:
                    try:
                        cur.execute("""
                            INSERT INTO dabi.training_qa
                                (question, answer, category, source_type, created_at)
                            VALUES (%s, %s, %s, 'cultural_seed_v2', now())
                            ON CONFLICT DO NOTHING
                        """, (q[:300], fact[:800], 'cultural_' + cat.split('_')[0][:20]))
                        inserted += cur.rowcount
                    except psycopg2.Error as e:
                        failed += 1
                        if failed <= 3:
                            logging.warning(f"insert err: {e}")
    finally:
        conn.close()

    if failed:
        logging.warning(f"Failed inserts: {failed}")
    logging.info(f"Inserted: {inserted} cultural Q&A v2")


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,53 @@
|
||||
#!/usr/bin/env python3
|
||||
# Self-quiz loop — DABI gets randomized PGŽ sport questions every 5min
|
||||
import psycopg2, requests, time, random, hashlib, logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [self_quiz] %(message)s')
|
||||
log = logging.getLogger("self_quiz")
|
||||
|
||||
# libpq connection string used by main().
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading the DSN from the environment or a secrets store.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
ORCH = "http://localhost:8080/api/v3/ask"
|
||||
|
||||
def _quiz_metadata(expected, actual):
    """Serialise the expected/actual answers (200 chars each) as a JSON object string."""
    import json

    return json.dumps(
        {"expected": (expected or '')[:200], "actual": (actual or '')[:200]},
        ensure_ascii=False,
    )


def _log_quiz_result(question, expected, actual):
    """Write one self-quiz result to ``dabi.system_log``; failures are logged, not raised."""
    try:
        conn = psycopg2.connect(DSN)
        try:
            conn.autocommit = True
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO dabi.system_log (event_type, message, metadata, created_at)
                    VALUES ('self_quiz', %s, %s::jsonb, now())
                """, (question[:200], _quiz_metadata(expected, actual)))
        finally:
            conn.close()
    except psycopg2.Error as e:
        log.warning(f"Quiz log fail: {e}")


def main():
    """Run the self-quiz loop forever.

    Each cycle picks 20 random ``pgz_sport_%`` question/answer pairs from
    ``dabi.training_qa``, posts every question to the orchestrator at ``ORCH``
    and stores the expected and actual answer in ``dabi.system_log``. There is
    a 3 s pause between questions, 300 s between cycles, and 60 s after a
    failed cycle.
    """
    while True:
        try:
            conn = psycopg2.connect(DSN)
            try:
                conn.autocommit = True
                with conn.cursor() as cur:
                    # Executed without parameters, so the literal % in the
                    # LIKE pattern is passed through untouched.
                    cur.execute("""
                        SELECT question, answer FROM dabi.training_qa
                        WHERE category LIKE 'pgz_sport_%'
                        ORDER BY random() LIMIT 20
                    """)
                    qa_pairs = cur.fetchall()
            finally:
                conn.close()

            for q, expected_a in qa_pairs:
                try:
                    r = requests.post(ORCH, json={"question": q, "persona": "app"}, timeout=15)
                    if r.status_code == 200:
                        actual = r.json().get('answer') or ''
                        if not isinstance(actual, str):
                            actual = str(actual)
                        # Log to dabi.system_log for later evaluation. The
                        # metadata is built with json.dumps; the previous
                        # hand-built string had unquoted values and was
                        # rejected by the ::jsonb cast.
                        _log_quiz_result(q, expected_a, actual)
                        log.info(f"Q: {q[:60]}... A: {actual[:80]}")
                except Exception as e:
                    # Best effort per question: keep the cycle going.
                    log.warning(f"Quiz fail: {e}")
                time.sleep(3)

            log.info("Cycle done, sleep 300s")
            time.sleep(300)
        except Exception as e:
            # Top-level boundary of the daemon loop: log, back off, retry.
            log.error(f"Loop error: {e}")
            time.sleep(60)


if __name__ == "__main__":
    main()
|
||||
Executable
+147
@@ -0,0 +1,147 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Embed dokumenti into Qdrant pgz_sport_dokumenti_v1 collection.
|
||||
Strategy:
|
||||
1. Use existing sadrzaj for docs that have content scraped
|
||||
2. For docs without sadrzaj — embed kratak_opis + naslov + organizacija
|
||||
3. Chunk into 800-char overlapping windows
|
||||
4. BGE-M3 embed via local server
|
||||
5. Store in Qdrant + dokument_chunks
|
||||
"""
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
# Keyword arguments passed to psycopg2.connect() in main().
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading it from the environment or a secrets store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
EMBED_URL = 'http://localhost:9879/api/embeddings' # BGE-M3
|
||||
QDRANT = 'http://10.10.0.2:6333'
|
||||
COLL = 'pgz_sport_dokumenti_v1'
|
||||
DIM = 1024
|
||||
CHUNK_SIZE = 800
|
||||
OVERLAP = 100
|
||||
|
||||
def ensure_collection():
    """Create the Qdrant collection ``COLL`` unless it already exists.

    The collection is created with ``DIM``-sized vectors and cosine distance.

    Raises:
        requests.HTTPError: if Qdrant rejects the create request.
        requests.RequestException: on connection problems or timeouts.
    """
    r = requests.get(f'{QDRANT}/collections/{COLL}', timeout=10)
    if r.status_code == 200:
        return
    created = requests.put(
        f'{QDRANT}/collections/{COLL}',
        json={"vectors": {"size": DIM, "distance": "Cosine"}},
        timeout=30,
    )
    # Only report success when Qdrant actually accepted the request.
    created.raise_for_status()
    print(f" ✓ Created collection {COLL}")
|
||||
|
||||
def embed_text(text):
    """Return the BGE-M3 embedding for *text* from the server at ``EMBED_URL``.

    Accepts either a top-level ``embedding`` field or the first item of a
    ``data`` list in the response.

    Returns:
        The embedding vector, or None when the response contains none.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    r = requests.post(EMBED_URL, json={"model":"bge-m3","prompt":text}, timeout=30)
    r.raise_for_status()
    body = r.json()  # parse the response once
    vec = body.get('embedding')
    if vec:
        return vec
    # An empty or missing "data" list yields None instead of an IndexError.
    items = body.get('data') or [{}]
    return items[0].get('embedding')
|
||||
|
||||
def chunk_text(text, size=CHUNK_SIZE, overlap=OVERLAP):
    """Split *text* into overlapping chunks after collapsing whitespace.

    Args:
        text: Raw text. Falsy or whitespace-only input yields ``[]``.
        size: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks; must be
            smaller than *size*.

    Returns:
        A list of chunk strings. Text no longer than *size* is returned as a
        single chunk.

    Raises:
        ValueError: if ``overlap >= size``, which would never advance.
    """
    if not text:
        return []
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []
    if len(text) <= size:
        return [text]

    step = size - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than size ({size})")

    chunks = []
    for start in range(0, len(text), step):
        chunks.append(text[start:start + size])
        if start + size >= len(text):
            # This window already reaches the end of the text; one more step
            # would only repeat a tail that is fully contained in this chunk.
            break
    return chunks
|
||||
|
||||
def main():
    """Rebuild the chunk table and the Qdrant collection for all active documents.

    Steps:
      1. Load every active row of ``pgz_sport.dokumenti``.
      2. Truncate ``pgz_sport.dokument_chunks`` and recreate collection ``COLL``.
      3. For each document, chunk ``sadrzaj`` when it is longer than 200
         characters, otherwise use title + short description + keywords.
      4. Embed each chunk (prefixed with a context header), upsert it into
         Qdrant and record it in ``dokument_chunks``.

    A failure on a single chunk is printed and that chunk is skipped.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

        # Fails early, before anything is deleted, if Qdrant is unreachable.
        ensure_collection()

        # Get all docs
        cu.execute("""SELECT id, title, sadrzaj, kratak_opis, vrsta, razina, organizacija,
                             sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci
                      FROM pgz_sport.dokumenti WHERE COALESCE(aktivan,true)=true""")
        rows = cu.fetchall()
        print(f"Embedding {len(rows)} dokumenata…")

        # Clear existing chunks. The collection is dropped and recreated
        # rather than deleting points by filter.
        cu.execute("TRUNCATE pgz_sport.dokument_chunks RESTART IDENTITY")
        dropped = requests.delete(f'{QDRANT}/collections/{COLL}', timeout=30)
        dropped.raise_for_status()
        ensure_collection()

        point_id = 1
        n_emb = 0
        for d in rows:
            # Build embed text
            title = (d.get('title') or '').strip()
            opis = (d.get('kratak_opis') or '').strip()
            sadrzaj = (d.get('sadrzaj') or '').strip()
            org = d.get('organizacija') or ''
            razina = d.get('razina') or ''
            vrsta = d.get('vrsta') or ''
            sport = d.get('sport') or ''
            kljuc = ', '.join(d.get('kljucne_rijeci') or [])
            glasnik = d.get('sluzbeni_glasnik') or ''

            # Header injected into every chunk for context
            header = f"[{vrsta.upper()} · {razina} · {org}]\n"
            if sport:
                header += f"Sport: {sport}\n"
            if glasnik:
                header += f"Službeni glasnik: {glasnik}\n"

            # Strategy: if sadrzaj > 200, chunk it. Else use kratak_opis+title.
            if sadrzaj and len(sadrzaj) > 200:
                chunks = chunk_text(sadrzaj)
            else:
                text_for_embed = f"{title}\n{opis}\n{kljuc}".strip()
                chunks = [text_for_embed] if text_for_embed else []

            if not chunks:
                continue

            for idx, chunk in enumerate(chunks):
                full_chunk = header + chunk[:CHUNK_SIZE]
                try:
                    vec = embed_text(full_chunk)
                    if not vec:
                        continue

                    payload = {
                        "dokument_id": d['id'],
                        "chunk_index": idx,
                        "title": title[:200],
                        "vrsta": vrsta,
                        "razina": razina,
                        "organizacija": org,
                        "sport": sport,
                        "sluzbeni_glasnik": glasnik,
                        "izvor_url": d.get('izvor_url') or '',
                        "preview": chunk[:200],
                    }
                    # Upsert into Qdrant first and check the status. If it
                    # fails, no DB row is written and point_id is not
                    # consumed. If the DB insert below fails instead, the
                    # next chunk upserts under the same id and overwrites
                    # the orphaned point.
                    upserted = requests.put(
                        f'{QDRANT}/collections/{COLL}/points',
                        json={"points": [{"id": point_id, "vector": vec, "payload": payload}]},
                        timeout=10)
                    upserted.raise_for_status()

                    # Save chunk to DB
                    cu.execute("""INSERT INTO pgz_sport.dokument_chunks
                                  (dokument_id, chunk_index, chunk_text, chunk_tokens, embedded_at, qdrant_point_id)
                                  VALUES (%s, %s, %s, %s, now(), %s)""",
                               (d['id'], idx, full_chunk, len(full_chunk.split()), point_id))
                    point_id += 1
                    n_emb += 1
                    if n_emb % 25 == 0:
                        print(f" embedded {n_emb} chunks…")
                except Exception as e:
                    # Best effort per chunk: report and carry on.
                    print(f" err doc {d['id']} chunk {idx}: {e}")

        # Final count
        qstats = requests.get(f'{QDRANT}/collections/{COLL}', timeout=30).json()
        print(f"\n✓ Embedded {n_emb} chunks total")
        print(f" Qdrant {COLL}: {qstats.get('result',{}).get('points_count',0)} points")

        cu.execute("SELECT count(*) AS n FROM pgz_sport.dokument_chunks")
        print(f" DB chunks: {cu.fetchone()['n']}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()
|
||||
Executable
+162
@@ -0,0 +1,162 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PGŽ Sport — Qdrant embedder.
|
||||
Embeds savezi, klubovi, sportaši (clanovi), natjecanja into BGE-M3 → Qdrant.
|
||||
|
||||
Collection: pgz_sport_v1 (1024 dim, BGE-M3)
|
||||
|
||||
Run modes:
|
||||
python embedder.py init # create Qdrant collection
|
||||
python embedder.py savezi # embed all savezi
|
||||
python embedder.py klubovi # embed all klubovi
|
||||
python embedder.py sportasi # embed all clanovi
|
||||
python embedder.py all # full refresh
|
||||
python embedder.py incremental # only items missing or stale
|
||||
"""
|
||||
import os, sys, time, json, hashlib, logging
|
||||
import psycopg2, psycopg2.extras, requests
|
||||
|
||||
# Keyword arguments passed to psycopg2.connect() by conn().
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading it from the environment or a secrets store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
EMBED = "http://localhost:9879/api/embeddings"
|
||||
QDRANT = "http://10.10.0.2:6333"
|
||||
COLLECTION = "pgz_sport_v1"
|
||||
DIM = 1024
|
||||
BATCH = 16
|
||||
|
||||
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO,
|
||||
handlers=[logging.FileHandler('/opt/pgz-sport/_logs/embedder.log'),
|
||||
logging.StreamHandler(sys.stdout)])
|
||||
log = logging.getLogger("emb")
|
||||
|
||||
def conn():
    """Open and return a new psycopg2 connection built from the ``DB`` settings."""
    connection = psycopg2.connect(**DB)
    return connection
|
||||
|
||||
def embed_batch(texts: list) -> list:
    """Embed a batch of texts with the server at ``EMBED``.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` without
            contacting the server.

    Returns:
        One embedding vector per input text, in input order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        RuntimeError: if the response has an unknown shape or does not
            contain exactly one vector per input text.
    """
    if not texts:
        return []
    r = requests.post(EMBED, json={"input": texts}, timeout=120)
    r.raise_for_status()
    d = r.json()
    if 'data' in d:
        vecs = [item['embedding'] for item in d['data']]
    elif 'embeddings' in d:
        vecs = d['embeddings']
    else:
        raise RuntimeError(f"unknown embed response shape: {list(d.keys())[:5]}")
    # Callers zip() rows with vectors; a short response would silently drop rows.
    if len(vecs) != len(texts):
        raise RuntimeError(f"embed server returned {len(vecs)} vectors for {len(texts)} texts")
    return vecs
|
||||
|
||||
def cmd_init():
    """Create the Qdrant collection ``COLLECTION`` if it does not exist yet.

    Raises:
        requests.HTTPError: if Qdrant rejects the create request.
        requests.RequestException: on connection problems or timeouts.
    """
    # Explicit timeouts: without one, requests can wait indefinitely.
    r = requests.get(f"{QDRANT}/collections/{COLLECTION}", timeout=30)
    if r.status_code == 200:
        log.info(f"Collection {COLLECTION} already exists")
        return
    r = requests.put(f"{QDRANT}/collections/{COLLECTION}", json={
        "vectors": {"size": DIM, "distance": "Cosine"},
        "optimizers_config": {"indexing_threshold": 10000},
    }, timeout=120)
    r.raise_for_status()
    log.info(f"Collection {COLLECTION} created")
|
||||
|
||||
def text_id(prefix: str, src_id: int) -> int:
    """Derive a stable numeric point id from ``prefix`` and ``src_id``.

    The id is the first 8 bytes of the SHA-1 of ``"<prefix>:<src_id>"``,
    read as a big-endian integer and shifted right by one bit so that the
    result stays below 2^63.
    """
    key = f"{prefix}:{src_id}"
    # 16 hex digits are the first 8 digest bytes in big-endian order.
    leading_hex = hashlib.sha1(key.encode()).hexdigest()[:16]
    return int(leading_hex, 16) >> 1
|
||||
|
||||
def upsert_points(points: list):
    """Upsert ``points`` into the Qdrant collection ``COLLECTION``.

    Does nothing for an empty batch. On an error status the response is
    logged (first 300 characters of the body) and ``requests.HTTPError`` is
    raised.
    """
    if points:
        url = f"{QDRANT}/collections/{COLLECTION}/points"
        resp = requests.put(url, json={"points": points}, timeout=120)
        if not resp.ok:
            log.error(f"qdrant upsert failed: {resp.status_code} {resp.text[:300]}")
        resp.raise_for_status()
|
||||
|
||||
def cmd_savezi():
    """Embed every active row of ``pgz_sport.savezi`` and upsert it into ``COLLECTION``.

    Rows are embedded in batches of ``BATCH`` and flushed to Qdrant whenever
    at least 64 points have accumulated.
    """
    from contextlib import closing

    cmd_init()
    # psycopg2's ``with connection`` only ends the transaction; closing()
    # makes sure the connection itself is released.
    with closing(conn()) as c:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
            cu.execute("""SELECT id, naziv, sport, predsjednik, tajnik, web, aktivan, napomena
                          FROM pgz_sport.savezi WHERE aktivan=true""")
            rows = cu.fetchall()
    log.info(f"Embedding {len(rows)} savezi…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski savez PGŽ: {r['naziv']}. Sport: {r['sport'] or ''}. "
                 f"Predsjednik: {r['predsjednik'] or '?'}. Tajnik: {r['tajnik'] or '?'}. "
                 f"{r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('savez', r['id']), "vector": v,
                        "payload": {"type":"savez","id":r['id'],"naziv":r['naziv'],
                                    "sport":r['sport'],"predsjednik":r['predsjednik'],
                                    "tajnik":r['tajnik'],"web":r['web']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    # Flush whatever is left over from the last batches.
    upsert_points(pts)
    log.info(f"Saved {len(rows)} savezi → {COLLECTION}")
|
||||
|
||||
def cmd_klubovi():
    """Embed every active row of ``pgz_sport.klubovi`` and upsert it into ``COLLECTION``.

    Each club is joined with its savez (if any) for the embedded text. Rows
    are embedded in batches of ``BATCH`` and flushed to Qdrant whenever at
    least 64 points have accumulated.
    """
    from contextlib import closing

    cmd_init()
    # psycopg2's ``with connection`` only ends the transaction; closing()
    # makes sure the connection itself is released.
    with closing(conn()) as c:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
            cu.execute("""SELECT k.id, k.naziv, k.sport, k.razina, k.grad, k.region,
                                 k.predsjednik, k.tajnik, k.napomena, k.hns_klub_id, s.naziv AS savez
                          FROM pgz_sport.klubovi k LEFT JOIN pgz_sport.savezi s ON s.id=k.savez_id
                          WHERE k.aktivan=true""")
            rows = cu.fetchall()
    log.info(f"Embedding {len(rows)} klubova…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportski klub PGŽ: {r['naziv']}. Sport: {r['sport'] or ''} ({r['razina'] or 'liga ?'}). "
                 f"Grad: {r['grad'] or '?'} ({r['region'] or 'PGŽ'}). "
                 f"Savez: {r['savez'] or '?'}. Predsjednik: {r['predsjednik'] or '?'}. "
                 f"Tajnik: {r['tajnik'] or '?'}. {r['napomena'] or ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('klub', r['id']), "vector": v,
                        "payload": {"type":"klub","id":r['id'],"naziv":r['naziv'],
                                    "sport":r['sport'],"razina":r['razina'],"grad":r['grad'],
                                    "region":r['region'],"hns_klub_id":r['hns_klub_id']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    # Flush whatever is left over from the last batches.
    upsert_points(pts)
    log.info(f"Saved {len(rows)} klubova → {COLLECTION}")
|
||||
|
||||
def cmd_sportasi():
    """Embed every row of ``pgz_sport.clanovi`` and upsert it into ``COLLECTION``.

    Each member is joined with their club (if any) for the embedded text.
    Rows are embedded in batches of ``BATCH`` and flushed to Qdrant whenever
    at least 64 points have accumulated.
    """
    from contextlib import closing

    cmd_init()
    # psycopg2's ``with connection`` only ends the transaction; closing()
    # makes sure the connection itself is released.
    with closing(conn()) as c:
        with c.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cu:
            cu.execute("""SELECT c.id, c.ime, c.prezime, c.datum_rodenja, c.mjesto_rodenja,
                                 c.pozicija, c.broj_dresa, c.reprezentativac, c.source,
                                 k.naziv AS klub_naziv, k.sport
                          FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id=c.klub_id""")
            rows = cu.fetchall()
    log.info(f"Embedding {len(rows)} sportaša…")
    pts = []
    for i in range(0, len(rows), BATCH):
        batch = rows[i:i+BATCH]
        texts = [f"Sportaš: {r['ime'] or ''} {r['prezime'] or ''}. "
                 f"Klub: {r['klub_naziv'] or '?'}. Sport: {r['sport'] or '?'}. "
                 f"Datum rođenja: {r['datum_rodenja'] or '?'}. Mjesto: {r['mjesto_rodenja'] or '?'}. "
                 f"Pozicija: {r['pozicija'] or '?'}. "
                 f"{'Reprezentativac.' if r['reprezentativac'] else ''}" for r in batch]
        vecs = embed_batch(texts)
        for r, v in zip(batch, vecs):
            pts.append({"id": text_id('sportas', r['id']), "vector": v,
                        "payload": {"type":"sportas","id":r['id'],
                                    "ime":r['ime'],"prezime":r['prezime'],
                                    "klub_naziv":r['klub_naziv'],"sport":r['sport'],
                                    "source":r['source']}})
        if len(pts) >= 64:
            upsert_points(pts)
            pts = []
    # Flush whatever is left over from the last batches.
    upsert_points(pts)
    log.info(f"Saved {len(rows)} sportaša → {COLLECTION}")
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: the first argument selects what to embed.
    # NOTE(review): the module docstring also lists an "incremental" mode, but
    # no handler for it is wired up here, so it is reported as unknown.
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    steps_by_command = {
        'init': (cmd_init,),
        'savezi': (cmd_savezi,),
        'klubovi': (cmd_klubovi,),
        'sportasi': (cmd_sportasi,),
        'all': (cmd_savezi, cmd_klubovi, cmd_sportasi),
    }
    steps = steps_by_command.get(cmd)
    if steps is None:
        print(f"unknown: {cmd}")
        sys.exit(2)
    for step in steps:
        step()
|
||||
Executable
+454
@@ -0,0 +1,454 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Seed key Croatian sport law summaries with article-level detail.
|
||||
This is expert knowledge — captures the legally relevant clauses that
|
||||
RAG search needs to answer practical questions."""
|
||||
import psycopg2
|
||||
|
||||
# PostgreSQL connection settings (host, port, database, user, password).
# NOTE(review): the database password is hard-coded here in plain text;
# consider loading it from the environment or a secrets store.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
# Doc spec: title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik, izvor_url, kljucne_rijeci, izdano_datum, sadrzaj
|
||||
EXPERT_DOCS = [
|
||||
{
|
||||
'title': 'Zakon o sportu (NN 141/22, 122/24)',
|
||||
'kratak_opis': 'Temeljni zakon koji uređuje sport u RH — pravne osobe u sportu, sportaši, savezi, financiranje, kategorizacija, registar.',
|
||||
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH', 'sport': None,
|
||||
'sluzbeni_glasnik': 'NN 141/22, 122/24',
|
||||
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
|
||||
'kljucne_rijeci': ['sport','sportaš','klub','savez','financiranje','kategorizacija','licenciranje','registar'],
|
||||
'izdano_datum': '2022-12-02',
|
||||
'sadrzaj': '''ZAKON O SPORTU (NN 141/22, izmjene NN 122/24)
|
||||
|
||||
OPĆE ODREDBE
|
||||
Ovaj Zakon uređuje sustav sporta, sportske djelatnosti, financiranje sporta, sportske građevine, te druga pitanja od značaja za sport u Republici Hrvatskoj.
|
||||
|
||||
Sportske djelatnosti su: 1) sudjelovanje u sportskom natjecanju, 2) sportska priprema, 3) sportska poduka, 4) sportska rekreacija, 5) organiziranje sportskog natjecanja i upravljanje sportskim natjecanjem, 6) upravljanje i održavanje sportske građevine.
|
||||
|
||||
PRAVNE OSOBE U SPORTU
|
||||
Pravne osobe u sustavu sporta su: sportski klubovi (udruge ili sportska dionička društva), sportski savezi (županijski, gradski, općinski, nacionalni), sportske zajednice, druge sportske organizacije.
|
||||
|
||||
Sportski klub je osnovni nositelj sportskih djelatnosti. Može biti: a) sportska udruga, b) sportsko dioničko društvo (š.d.d.), c) profesionalni sportski klub.
|
||||
|
||||
OBVEZE SPORTSKOG KLUBA:
|
||||
- Mora biti upisan u Registar udruga RH ili Sudski registar (š.d.d.)
|
||||
- Mora biti upisan u Evidenciju pravnih osoba u sustavu sporta (vodi MTS)
|
||||
- Mora imati statut, tijela uprave, izabranog predsjednika
|
||||
- Vodi evidenciju o članstvu sportaša
|
||||
- Pridržava se anti-doping propisa
|
||||
- Osigurava liječnički nadzor sportaša
|
||||
- Pridržava se pravila o zaštiti djece u sportu
|
||||
|
||||
SPORTAŠ
|
||||
Sportaš je fizička osoba koja se bavi sportom. Status: 1) amaterski sportaš (bez naknade), 2) profesionalni sportaš (na temelju ugovora o radu ili ugovora o profesionalnom obavljanju sportskih djelatnosti).
|
||||
|
||||
Vrhunski sportaš ima posebne statuse i prava (kategorija I-V prema HOO kategorizaciji).
|
||||
|
||||
Maloljetni sportaš (do 16 godina): roditelj/skrbnik daje suglasnost. Profesionalni ugovor s osobom mlađom od 16 godina nije dopušten.
|
||||
|
||||
NACIONALNI SPORTSKI SAVEZ
|
||||
Predstavlja konkretni sport pred međunarodnim federacijama, organizira državna prvenstva, donosi pravilnik o registraciji sportaša, izdaje licencije.
|
||||
|
||||
REGISTAR SPORTSKIH UDRUGA I D.D.
|
||||
Vodi MTS (Ministarstvo turizma i sporta). Javan, dostupan na mtus.gov.hr.
|
||||
|
||||
FINANCIRANJE SPORTA
|
||||
Sport se financira iz: državnog proračuna, proračuna JLS i JPRS, vlastitih sredstava, donacija, sponzorstava.
|
||||
|
||||
JAVNE POTREBE U SPORTU (JPS)
|
||||
JLS (gradovi, općine) i JPRS (županije) donose godišnje programe javnih potreba u sportu (JPS) i raspoređuju sredstva.
|
||||
|
||||
KATEGORIZACIJA SPORTAŠA (HOO sustav)
|
||||
Kategorija I — vrhunski svjetski sportaš (medalja na OI/SP)
|
||||
Kategorija II — vrhunski međunarodni sportaš
|
||||
Kategorija III — vrhunski državni sportaš
|
||||
Kategorija IV — vrhunski mladi sportaš
|
||||
Kategorija V — perspektivni sportaš
|
||||
|
||||
ANTI-DOPING
|
||||
Svi sportaši pod jurisdikcijom HADA-e/HASMS. Obvezno testiranje. Sankcije za pozitivni nalaz: privremena/trajna suspenzija.
|
||||
|
||||
SIGURNOST NA NATJECANJIMA
|
||||
Organizator natjecanja odgovara za sigurnost. Obvezno: redarski službenici, video nadzor (od određenog kapaciteta), suradnja s policijom.
|
||||
|
||||
SPORTSKE GRAĐEVINE
|
||||
JLS i JPRS održavaju sportske građevine. Standard: pristupačnost, sigurnost, tehnička ispravnost.
|
||||
|
||||
PREKRŠAJI I SANKCIJE
|
||||
Glob od 5.000 do 50.000 EUR za pravnu osobu koja: ne vodi evidenciju, ne pridržava se anti-doping propisa, ne osigurava liječnički nadzor, krši zaštitu djece.''',
|
||||
},
|
||||
{
|
||||
'title': 'Zakon o udrugama (NN 74/14, 70/17, 98/19, 151/22)',
|
||||
'kratak_opis': 'Uređuje osnivanje, registraciju, djelovanje i prestanak udruga — primjenjuje se na sve sportske klubove osnovane kao udruge.',
|
||||
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH',
|
||||
'sport': None,
|
||||
'sluzbeni_glasnik': 'NN 74/14, 70/17, 98/19, 151/22',
|
||||
'izvor_url': 'https://www.zakon.hr/z/64/Zakon-o-udrugama',
|
||||
'kljucne_rijeci': ['udruga','registracija','statut','tijela','financiranje'],
|
||||
'izdano_datum': '2014-06-18',
|
||||
'sadrzaj': '''ZAKON O UDRUGAMA (NN 74/14, 70/17, 98/19, 151/22)
|
||||
|
||||
UDRUGA je svaki oblik slobodnog i dobrovoljnog udruživanja više fizičkih, odnosno pravnih osoba koje se, radi zaštite njihovih probitaka ili zauzimanja za zaštitu ljudskih prava i sloboda, ekološka, humanitarna, informacijska, kulturna, nacionalna, pronatalitetna, prosvjetna, socijalna, strukovna, sportska, tehnička, zdravstvena, znanstvena ili druga uvjerenja i ciljeve, a bez namjere stjecanja dobiti ili drugih gospodarski procjenjivih koristi, podvrgavaju pravilima koja uređuju ustroj i djelovanje toga oblika udruživanja.
|
||||
|
||||
OSNIVANJE UDRUGE: Najmanje 3 osnivača (fizičke ili pravne osobe). Donose Statut. Biraju zastupnike (predsjednika, tajnika i sl.).
|
||||
|
||||
REGISTRACIJA: Upis u Registar udruga RH (vodi nadležno tijelo). Stječe pravnu osobnost danom upisa.
|
||||
|
||||
STATUT UDRUGE — obvezan sadržaj:
|
||||
1) naziv i sjedište
|
||||
2) zastupanje
|
||||
3) područja djelovanja sukladno ciljevima
|
||||
4) ciljevi (npr. razvoj sporta)
|
||||
5) djelatnosti kojima se ostvaruju ciljevi
|
||||
6) gospodarske djelatnosti (ako se obavljaju, npr. ulaznice)
|
||||
7) članstvo (uvjeti, prava, obveze)
|
||||
8) tijela udruge (Skupština, predsjednik, izvršni odbor)
|
||||
9) izborna razdoblja
|
||||
10) imovina i raspolaganje
|
||||
11) postupak likvidacije
|
||||
|
||||
TIJELA UDRUGE:
|
||||
- Skupština (najviše tijelo, sastavljena od svih članova)
|
||||
- Predsjednik (zastupa udrugu)
|
||||
- Drugi: Izvršni odbor, Nadzorni odbor, Tajnik
|
||||
|
||||
ČLANSTVO: Dobrovoljno. Maloljetnici od 14-18 godina mogu biti članovi uz suglasnost roditelja/skrbnika. Mlađi od 14 mogu biti članovi bez prava odlučivanja.
|
||||
|
||||
FINANCIRANJE: Članarine, donacije, dotacije, sredstva iz proračuna, dobit od gospodarskih djelatnosti (može biti samo komplementarna djelatnost).
|
||||
|
||||
OBVEZE: 1) javna objava godišnjeg financijskog izvještaja, 2) prijava promjena u Registar, 3) sazivanje Skupštine najmanje jednom godišnje, 4) vođenje evidencije članstva.
|
||||
|
||||
PRESTANAK: 1) odluka Skupštine, 2) pripajanje, 3) nepovoljno financijsko stanje, 4) zabrana djelovanja, 5) likvidacija nakon stečajnog postupka.
|
||||
|
||||
NADZOR: Provodi nadležno upravno tijelo (županijski uredi). Inspekcijski nadzor.''',
|
||||
},
|
||||
{
|
||||
'title': 'Zakon o sprečavanju dopinga u sportu (NN 70/17, 32/20)',
|
||||
'kratak_opis': 'Implementacija WADA Code-a u RH — obveze sportaša, organizacija, sankcije.',
|
||||
'vrsta': 'zakon', 'razina': 'RH', 'organizacija': 'Sabor RH', 'sport': None,
|
||||
'sluzbeni_glasnik': 'NN 70/17, 32/20',
|
||||
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html',
|
||||
'kljucne_rijeci': ['doping','WADA','testiranje','TUE','sankcije','HADA'],
|
||||
'izdano_datum': '2017-07-21',
|
||||
'sadrzaj': '''ZAKON O SPREČAVANJU DOPINGA U SPORTU (NN 70/17, 32/20)
|
||||
|
||||
OPĆE ODREDBE
|
||||
Doping u sportu je: prisutnost zabranjene tvari, korištenje zabranjene metode, izbjegavanje testiranja, falsifikacija dokaza, posjedovanje, distribucija ili administriranje zabranjenih tvari.
|
||||
|
||||
Lista zabranjenih tvari/metoda — Kao prilog Zakona, ažurira se godišnje prema WADA Prohibited List.
|
||||
|
||||
NACIONALNA AGENCIJA: HASMS (Hrvatska agencija za sport i medicinu sporta) je nacionalna anti-doping organizacija (NADO). Provodi testiranje, donosi odluke o sankcijama.
|
||||
|
||||
OBVEZE SPORTAŠA:
|
||||
1) Pristajanje na testiranje (uzorci urina, krvi)
|
||||
2) Obavještavanje o lokaciji (Whereabouts za sportaše u registriranom testnom poolu)
|
||||
3) Suradnja u istragi
|
||||
4) Ne korištenje zabranjenih tvari
|
||||
5) Pribavljanje TUE (Therapeutic Use Exemption) ako je medicinski potrebno
|
||||
6) Edukacija o anti-dopingu
|
||||
|
||||
OBVEZE SPORTSKIH ORGANIZACIJA (klubovi, savezi):
|
||||
1) Implementacija anti-doping politike
|
||||
2) Edukacija sportaša i osoblja
|
||||
3) Surađivanje s HASMS-om
|
||||
4) Disciplinski postupak za sportaše s pozitivnim nalazom
|
||||
|
||||
TUE (THERAPEUTIC USE EXEMPTION):
|
||||
Postupak za odobrenje korištenja zabranjene tvari iz medicinskih razloga.
|
||||
TUE komisija HASMS-a odlučuje. Mora biti odobren PRIJE korištenja (osim hitnih slučajeva).
|
||||
|
||||
REZULTATI MENADŽMENT (Results Management):
|
||||
- Pozitivni nalaz → A uzorak → potvrda B uzorka → disciplinski postupak
|
||||
- Sportaš ima pravo na obranu, žalbu (Sportski arbitražni sud — CAS u Lausanne)
|
||||
|
||||
SANKCIJE:
|
||||
- Standardno kršenje (prvo): 4 godine ili više suspenzije
|
||||
- Specifične tvari (prvi put, slučajno): od 2 godine
|
||||
- Drugi put: dvostruko trajanje
|
||||
- Trajna doživotna suspenzija (treći put ili teško kršenje)
|
||||
|
||||
DODATNE SANKCIJE:
|
||||
- Diskvalifikacija rezultata
|
||||
- Oduzimanje medalja, nagrada, novca
|
||||
- Zabrana ulaska na sportske događaje
|
||||
- Disciplinski postupak unutar saveza
|
||||
|
||||
PREKRŠAJI ZA TREĆE OSOBE (treneri, liječnici, osoblje):
|
||||
Distribucija, administriranje, sustavno doping podrška — kazna do 50.000 EUR i zabrana rada u sportu.
|
||||
|
||||
WHISTLEBLOWER ZAŠTITA: Zaštita osoba koje prijavljuju doping kršenja.''',
|
||||
},
|
||||
{
|
||||
'title': 'Pravilnik o kategorizaciji sportaša (HOO)',
|
||||
'kratak_opis': 'Kriteriji kategorizacije vrhunskih sportaša RH (I-V) prema rezultatima na međunarodnim natjecanjima.',
|
||||
'vrsta': 'pravilnik', 'razina': 'HOO', 'organizacija': 'HOO', 'sport': None,
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://www.hoo.hr/hr-hr/sport-u-hrvatskoj/kategorizacija-sportasa',
|
||||
'kljucne_rijeci': ['kategorizacija','vrhunski','HOO','OI','SP','EP'],
|
||||
'izdano_datum': '2023-01-01',
|
||||
'sadrzaj': '''PRAVILNIK O KATEGORIZACIJI VRHUNSKIH SPORTAŠA (HOO)
|
||||
|
||||
Kategorizacija sportaša je razvrstavanje sportaša prema rezultatima na međunarodnim natjecanjima.
|
||||
|
||||
KATEGORIJE:
|
||||
|
||||
I. KATEGORIJA — Vrhunski svjetski sportaš (Medalja OI/SP/EP):
|
||||
- Olimpijska medalja (zlato, srebro, bronca) na OI
|
||||
- Medalja na svjetskom prvenstvu seniora
|
||||
- 1.-3. mjesto na europskom prvenstvu seniora u olimpijskom sportu
|
||||
- 1.-3. mjesto na profesionalnom svjetskom prvenstvu (npr. Davis Cup)
|
||||
- Trajanje statusa: 4 godine od ostvarenja
|
||||
- Prava: državna stipendija (najveća), profesionalni status, doprinosi za mirovinski staž
|
||||
|
||||
II. KATEGORIJA — Vrhunski međunarodni sportaš:
|
||||
- 4.-8. mjesto na OI
|
||||
- 4.-8. mjesto na SP seniora
|
||||
- 4.-8. mjesto na EP seniora
|
||||
- 1.-3. mjesto na SP/EP juniora (U23 ili U20)
|
||||
- Trajanje: 2-4 godine
|
||||
- Prava: stipendija, profesionalni status
|
||||
|
||||
III. KATEGORIJA — Vrhunski državni sportaš:
|
||||
- 9.-12. mjesto OI
|
||||
- Sudjelovanje u finalu (1.-8.) na SP/EP
|
||||
- 1.-3. mjesto u Svjetskom kupu, Sredozemnim igrama
|
||||
- Reprezentativci u olimpijskim ekipnim sportovima
|
||||
- Prava: stipendija, status nacionalnog razreda
|
||||
|
||||
IV. KATEGORIJA — Vrhunski mladi sportaš:
|
||||
- Medalja na SP/EP kadeta (U17/U18)
|
||||
- 1.-3. mjesto na omladinskim olimpijadama (YOG)
|
||||
- Reprezentativac u kadetskoj/juniorskoj selekciji
|
||||
- Prava: školarine, stipendije za sportaše-učenike
|
||||
|
||||
V. KATEGORIJA — Perspektivni sportaš:
|
||||
- Sudjelovanje na SP/EP juniora/kadeta
|
||||
- Najbolji rezultat u dobnoj kategoriji u RH
|
||||
- Prava: dodatna podrška u školovanju, oprema
|
||||
|
||||
POSEBNI STATUSI:
|
||||
- Sportaš s posebnim statusom (NN 14/23) — vrhunski sportaš I/II kategorije ima pravo na doprinose iz proračuna RH.
|
||||
- Reprezentativac — sportaš pozvan u nacionalnu selekciju ima posebne ovlasti i obveze.
|
||||
|
||||
POSTUPAK KATEGORIZACIJE:
|
||||
1) Nacionalni savez podnosi zahtjev HOO-u
|
||||
2) HOO Komisija za kategorizaciju verificira rezultate
|
||||
3) Odluka HOO Vijeća
|
||||
4) Upis u registar kategoriziranih sportaša
|
||||
|
||||
GUBITAK STATUSA:
|
||||
- Istek roka kategorizacije (ako nije obnovljena novim rezultatom)
|
||||
- Doping suspenzija
|
||||
- Kraj sportske karijere
|
||||
- Disciplinske mjere
|
||||
|
||||
NAGRAĐIVANJE: HOO godišnje nagrade za izuzetne rezultate. Nagrada nije ekvivalentna kategoriji ali ide uz nju.''',
|
||||
},
|
||||
{
|
||||
'title': 'Pravilnik o registraciji igrača HNS (2024)',
|
||||
'kratak_opis': 'Klasifikacija nogometaša, dobne kategorije, transferi, FIFA TMS — temelj registracije svih igrača u HNS sustavu.',
|
||||
'vrsta': 'pravilnik_savez', 'razina': 'Savez', 'organizacija': 'HNS', 'sport': 'nogomet',
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://hns-cff.hr/files/documents/RegulatorniOkvir/PravilnikOStatusuIRegistracijiIgraca2024.pdf',
|
||||
'kljucne_rijeci': ['registracija','status','transfer','licenca','HNS','FIFA','TMS','dob'],
|
||||
'izdano_datum': '2024-09-01',
|
||||
'sadrzaj': '''PRAVILNIK O STATUSU I REGISTRACIJI IGRAČA HNS (2024)
|
||||
|
||||
OPĆE ODREDBE
|
||||
Sve nogometne aktivnosti pod jurisdikcijom HNS-a zahtijevaju registraciju igrača.
|
||||
|
||||
STATUS IGRAČA:
|
||||
1) AMATER — bez naknade, dobiva samo pokriće stvarnih troškova
|
||||
2) PROFESIONALAC — pisani ugovor, naknada veća od stvarnih troškova
|
||||
3) MLADI IGRAČ (do 18 godina) — posebni propisi
|
||||
|
||||
DOBNE KATEGORIJE (sezona 2024./2025., godina rođenja):
|
||||
- U7 (Limači): 2018./19. godište
|
||||
- U9 (Početnici): 2016./17. godište
|
||||
- U11 (Mlađi pioniri): 2014./15. godište
|
||||
- U13 (Pioniri): 2012./13. godište
|
||||
- U15 (Mlađi kadeti): 2010./11. godište
|
||||
- U17 (Kadeti): 2008./09. godište
|
||||
- U19 (Juniori): 2006./07. godište
|
||||
- U20 (Mlađi seniori): 2005. godište
|
||||
- Seniori: 2004. i stariji
|
||||
|
||||
PRAVILA:
|
||||
- Igrač U17 može igrati i u U19 i seniorima istog kluba
|
||||
- Igrač U19 može igrati u seniorima
|
||||
- Pomicanje "u dolje" (npr. U19 igra U17) NIJE dozvoljeno
|
||||
|
||||
REGISTRACIJA:
|
||||
- Klub podnosi zahtjev HNS regiji
|
||||
- Igrač može biti registriran samo u jednom klubu istovremeno
|
||||
- Registracijski period: 1.7.-31.8. (ljetni), 1.1.-31.1. (zimski)
|
||||
- Vrijedi do izmjene statusa ili ostavke
|
||||
|
||||
TRANSFERI (HR domaći):
|
||||
- Klubovi podnose Ugovor o transferu
|
||||
- Obeštećenje za razvoj mladih: 5% transfer naknade za svaki klub gdje je igrao između 12. i 21. godine
|
||||
- HNS arbitražno tijelo rješava sporove
|
||||
|
||||
MEĐUNARODNI TRANSFERI (FIFA TMS):
|
||||
- ITC (International Transfer Certificate) preko FIFA TMS sistema
|
||||
- Igrač mlađi od 18 godina: zabrana međunarodnog transfera (osim iznimaka prema FIFA RSTP čl. 19)
|
||||
|
||||
LICENCIRANJE:
|
||||
- Igrač mora imati važeću licencu HNS
|
||||
- Profesionalni ugovor: registracija + objava u HNS bazi
|
||||
- Godišnji medicinski pregled obvezan
|
||||
|
||||
UGOVOR O PROFESIONALNOM IGRANJU:
|
||||
- Pisani oblik
|
||||
- Min. trajanje: 1 sezona
|
||||
- Max. trajanje: 5 godina (3 godine za maloljetne)
|
||||
- Naknada za jednostrani raskid (FIFA RSTP čl. 17)
|
||||
|
||||
DISCIPLINSKE MJERE:
|
||||
- Žuti karton — automatska zabrana 1 utakmica nakon 4 žuta
|
||||
- Crveni karton — automatska zabrana 1+ utakmica
|
||||
- Disciplinski sud HNS — dodatne sankcije
|
||||
|
||||
SUSPENZIJE I PRIGOVORI:
|
||||
- Igrač ima pravo na žalbu (HNS arbitražno tijelo)
|
||||
- Konačna instanca: CAS (Lausanne) za međunarodne sporove''',
|
||||
},
|
||||
{
|
||||
'title': 'Pravilnik o kriterijima za vrednovanje programa JPS PGŽ',
|
||||
'kratak_opis': 'PGŽ Zajednica sportova kriteriji za vrednovanje i odabir programa javnih potreba u sportu.',
|
||||
'vrsta': 'pravilnik', 'razina': 'PGZ', 'organizacija': 'Zajednica sportova PGŽ', 'sport': None,
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://www.pgz.hr/sport/kriteriji',
|
||||
'kljucne_rijeci': ['kriteriji','vrednovanje','PGŽ','JPS','klub','savez','financiranje'],
|
||||
'izdano_datum': '2024-01-01',
|
||||
'sadrzaj': '''PRAVILNIK O KRITERIJIMA ZA VREDNOVANJE I ODABIR PROGRAMA JPS PGŽ
|
||||
|
||||
Sukladno Zakonu o sportu i Statutu PGŽ, Zajednica sportova PGŽ raspoređuje sredstva javnih potreba u sportu (JPS) prema sljedećim kriterijima:
|
||||
|
||||
PROGRAMI KOJI SE FINANCIRAJU:
|
||||
1) Sport djece, mladih i studenata (selekcije, škole sporta)
|
||||
2) Vrhunski sport (vrhunski sportaši I-V kategorije)
|
||||
3) Sport osoba s invaliditetom (parasport)
|
||||
4) Sport za sve (rekreativni)
|
||||
5) Sportske manifestacije i natjecanja
|
||||
6) Stručno obrazovanje u sportu
|
||||
7) Sportska infrastruktura (oprema, manje investicije)
|
||||
8) Antidoping aktivnosti
|
||||
|
||||
KRITERIJI VREDNOVANJA (BODOVI):
|
||||
|
||||
A) Sportski rezultati (40%):
|
||||
- Olimpijski sport vs. neolimpijski (15 vs 10 bodova bazni)
|
||||
- Rezultati u kategorijama (PGŽ, RH, EU, SP)
|
||||
- Rang nacionalne selekcije (1.-3. liga)
|
||||
- Broj reprezentativaca
|
||||
|
||||
B) Masovnost (25%):
|
||||
- Broj registriranih sportaša po kategorijama
|
||||
- Broj djece u školama sporta
|
||||
- Broj sudionika u natjecanjima
|
||||
|
||||
C) Stručni kadar (15%):
|
||||
- Broj licenciranih trenera (po HOO kategorijama)
|
||||
- Broj licenciranih sudaca
|
||||
- Kvalifikacije (osnovni, viši, vrhunski)
|
||||
|
||||
D) Tradicija i razvoj (10%):
|
||||
- Godina osnivanja kluba/saveza
|
||||
- Kontinuitet rada
|
||||
- Razvoj kroz godine
|
||||
|
||||
E) Financiranje i transparentnost (10%):
|
||||
- Vlastiti prihodi (članarine, sponzorstva)
|
||||
- Suvlasništvo / partnerstva
|
||||
- Pravovremeni financijski izvještaji
|
||||
|
||||
NOSITELJI KVALITETE:
|
||||
Klubovi koji ispunjavaju POVEĆANE kriterije imaju status "Nositelj kvalitete u sportu PGŽ":
|
||||
- Sudjelovanje u europskim/svjetskim klupskim natjecanjima, ILI
|
||||
- Najmanje 5 reprezentativaca u jednom razdoblju, ILI
|
||||
- Vrhunski sportaš I-II kategorije aktivan u klubu
|
||||
|
||||
Status nositelja kvalitete dobiva 30% veće sufinanciranje + nagrade.
|
||||
|
||||
POSEBNI ZAHTJEVI:
|
||||
Svi klubovi/savezi MORAJU:
|
||||
- Biti registrirani u Registru udruga RH
|
||||
- Biti članovi Zajednice sportova PGŽ
|
||||
- Pravodobno predati godišnja financijska izvješća
|
||||
- Nemati nepodmirena dugovanja prema PGŽ
|
||||
- Pridržavati se Zakona o sportu i pravilnika nadležnih saveza
|
||||
|
||||
POSTUPAK:
|
||||
1) Javni natječaj (raspisuje ZS PGŽ)
|
||||
2) Prijave klubovi/savezi (rok 30 dana)
|
||||
3) Vrednovanje (Stručna komisija)
|
||||
4) Odluka Skupštine ZS PGŽ
|
||||
5) Ugovor o sufinanciranju
|
||||
6) Praćenje i izvještavanje
|
||||
|
||||
UVJETI ZA UGOVOR:
|
||||
- Sredstva se troše IZRAVNO za navedeni program
|
||||
- Mjesečna ili kvartalna isplata
|
||||
- Završni izvještaj do 31.3. iduće godine
|
||||
- Mogući povrat sredstava kod neizvršenja''',
|
||||
},
|
||||
]
|
||||
|
||||
def _like_contains(text):
    """Return *text* as a ``%text%`` LIKE/ILIKE pattern with wildcards escaped.

    Backslash is PostgreSQL's default LIKE escape character, so a backslash,
    ``%`` or ``_`` occurring in *text* is matched literally rather than being
    interpreted as a wildcard.
    """
    escaped = text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return f"%{escaped}%"


def main():
    """Upsert every EXPERT_DOCS entry into pgz_sport.dokumenti and print totals.

    For each document an existing row is looked up by title: first with the
    full title, then with the title stripped of its parenthesised suffix. A
    row matches when its title equals the pattern case-insensitively or
    contains the pattern's first 30 characters. Exact matches are preferred
    over substring matches and ties are broken by the lowest id, so the row
    chosen is deterministic.

    A matched row gets its ``sadrzaj`` overwritten and ``kratak_opis``
    replaced; the remaining columns are only filled in when currently NULL.
    Unmatched documents are inserted as new, active rows.

    The connection runs in autocommit mode, so every statement is persisted
    immediately; the connection is closed even if a statement raises.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        with conn.cursor() as cu:
            n_added = 0
            n_updated = 0

            for d in EXPERT_DOCS:
                # Match strategies in priority order: the full title, then the
                # title without its parenthesised suffix. Empty and duplicate
                # patterns are dropped: an empty one would turn into '%%' and
                # match (and then overwrite) an arbitrary row.
                title_patterns = []
                for candidate in (d['title'], d['title'].split('(')[0].strip()):
                    if candidate and candidate not in title_patterns:
                        title_patterns.append(candidate)

                existing_id = None
                for pat in title_patterns:
                    cu.execute(
                        """SELECT id FROM pgz_sport.dokumenti
                           WHERE LOWER(title) = LOWER(%s) OR title ILIKE %s
                           ORDER BY (LOWER(title) = LOWER(%s)) DESC, id
                           LIMIT 1""",
                        (pat, _like_contains(pat[:30]), pat))
                    r = cu.fetchone()
                    if r:
                        existing_id = r[0]
                        break

                if existing_id is not None:
                    # Update with full sadrzaj; keep already-populated metadata.
                    cu.execute(
                        """UPDATE pgz_sport.dokumenti
                           SET sadrzaj=%s, kratak_opis=COALESCE(%s, kratak_opis),
                               izvor_url=COALESCE(izvor_url, %s),
                               sluzbeni_glasnik=COALESCE(sluzbeni_glasnik, %s),
                               kljucne_rijeci=COALESCE(kljucne_rijeci, %s),
                               izdano_datum=COALESCE(izdano_datum, %s)
                           WHERE id=%s""",
                        (d['sadrzaj'], d['kratak_opis'], d['izvor_url'],
                         d['sluzbeni_glasnik'], d['kljucne_rijeci'], d['izdano_datum'],
                         existing_id))
                    n_updated += 1
                    print(f" ✓ Updated #{existing_id}: {d['title'][:60]}")
                else:
                    # Insert new
                    cu.execute(
                        """INSERT INTO pgz_sport.dokumenti
                           (title, kratak_opis, sadrzaj, vrsta, razina, organizacija, sport,
                            sluzbeni_glasnik, izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                           VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)""",
                        (d['title'], d['kratak_opis'], d['sadrzaj'], d['vrsta'],
                         d['razina'], d['organizacija'], d['sport'], d['sluzbeni_glasnik'],
                         d['izvor_url'], d['kljucne_rijeci'], d['izdano_datum']))
                    n_added += 1
                    print(f" + Added: {d['title'][:60]}")

            print(f"\nAdded: {n_added}, Updated: {n_updated}")

            # Rows whose sadrzaj is longer than 1000 characters.
            cu.execute("""SELECT count(*) FILTER (WHERE sadrzaj IS NOT NULL AND length(sadrzaj) > 1000) FROM pgz_sport.dokumenti""")
            print(f"Dokumenata s full text: {cu.fetchone()[0]}")

            cu.execute("SELECT count(*) FROM pgz_sport.dokumenti")
            print(f"TOTAL: {cu.fetchone()[0]}")
    finally:
        # psycopg2's connection context manager does not close the connection,
        # so close it explicitly on both the success and the error path.
        conn.close()


if __name__ == '__main__':
    main()
|
||||
Executable
+464
@@ -0,0 +1,464 @@
|
||||
#!/usr/bin/env python3
|
||||
import psycopg2
|
||||
import os

# Connection settings passed to psycopg2.connect().
# SECURITY: the password literal is committed to version control. It is kept
# only as a backward-compatible fallback; set PGZ_DB_PASSWORD in the
# environment to override it, then rotate the credential and drop the literal.
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ.get('PGZ_DB_PASSWORD', 'R1net2026!SecureDB#v7'))
|
||||
|
||||
DOCS = [
|
||||
{
|
||||
'title': 'Pravilnik o sigurnosti na sportskim događanjima (NN 117/03)',
|
||||
'kratak_opis': 'Sigurnosne mjere — redarski službenici, video nadzor, suradnja s policijom.',
|
||||
'vrsta': 'pravilnik', 'razina': 'RH', 'organizacija': 'MUP + MTS',
|
||||
'sluzbeni_glasnik': 'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19',
|
||||
'izvor_url': 'https://www.zakon.hr/z/345/Zakon-o-spre%C4%8Davanju-nereda-na-sportskim-natjecanjima',
|
||||
'kljucne_rijeci': ['sigurnost','redari','navijači','MUP','video nadzor'],
|
||||
'izdano_datum': '2003-07-15',
|
||||
'sadrzaj': '''ZAKON I PRAVILNIK O SIGURNOSTI NA SPORTSKIM DOGAĐANJIMA
|
||||
|
||||
OPĆE ODREDBE
|
||||
Ovaj zakon i prateći pravilnik uređuju mjere za sprječavanje nereda na sportskim natjecanjima i drugim sportskim događanjima u RH.
|
||||
|
||||
ORGANIZATOR NATJECANJA — odgovoran je za:
|
||||
1) Procjenu sigurnosnih rizika prije svakog događanja
|
||||
2) Angažiranje propisanog broja redara (minimum prema kapacitetu)
|
||||
3) Suradnju s nadležnom policijskom upravom
|
||||
4) Postavljanje i funkcioniranje video nadzora
|
||||
5) Kontrolu ulaza i izlaza navijača
|
||||
6) Razdvajanje navijača domaćih i gostujućih klubova
|
||||
|
||||
KATEGORIJE NATJECANJA PREMA RIZIKU:
|
||||
- Niski rizik: 1 redar/100 gledatelja
|
||||
- Srednji rizik: 1 redar/50 gledatelja, dodatna policijska zaštita
|
||||
- Visoki rizik (derbiji): 1 redar/30 gledatelja, intervencija MUP-a, video nadzor 100%
|
||||
|
||||
OBVEZE REDARA:
|
||||
- Pohađanje obrazovne obuke (60 sati teorija + praksa)
|
||||
- Položen ispit pred ovlaštenim odborom
|
||||
- Zaštitna licenca koju izdaje MUP
|
||||
- Vidljiva odora s identifikacijskim oznakama
|
||||
- Minimalna dob 18 godina
|
||||
|
||||
ZABRANJENO PONAŠANJE NAVIJAČA:
|
||||
- Bacanje predmeta u igralište
|
||||
- Pirotehnika (osim odobrene)
|
||||
- Verbalno/fizičko nasilje
|
||||
- Prekoračenje granica navijačkih sektora
|
||||
- Korištenje opasnih predmeta
|
||||
|
||||
SANKCIJE:
|
||||
- Privremena zabrana ulaska na sportska događanja (od 6 mjeseci do 5 godina)
|
||||
- Globa od 250 do 5.000 EUR
|
||||
- Kaznena prijava za teža kaznena djela
|
||||
- Zabrana putovanja na inozemne utakmice za teže prekršaje
|
||||
|
||||
OBVEZA PRIJAVE:
|
||||
- Organizator je dužan prijaviti svaki incident u roku 24 sata
|
||||
- Vodi se evidencija pri MUP-u
|
||||
- Lista zabrana ulaska je centralizirana
|
||||
|
||||
SPORTSKE GRAĐEVINE — TEHNIČKI ZAHTJEVI:
|
||||
- Označeni izlazni putovi (svjetlosni signali)
|
||||
- Vatrogasna zaštita
|
||||
- Medicinska služba na licu mjesta
|
||||
- Prva pomoć kapaciteta proporcionalna kapacitetu
|
||||
|
||||
UEFA / FIFA STANDARDI:
|
||||
- Prema UEFA Stadium Regulations primjenjuju se za europska klupska natjecanja
|
||||
- Stadion Rujevica (HNK Rijeka) ima UEFA Category 4 status
|
||||
- Posebni uvjeti za europske utakmice'''
|
||||
},
|
||||
{
|
||||
'title': 'Pravilnik o licenciranju trenera u sportu (NN 89/23)',
|
||||
'kratak_opis': 'Sustav licenciranja stručnog kadra — UEFA, FIBA, FINA i HOO kategorije.',
|
||||
'vrsta': 'pravilnik', 'razina': 'RH', 'organizacija': 'MTS',
|
||||
'sluzbeni_glasnik': 'NN 89/23',
|
||||
'izvor_url': 'https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html',
|
||||
'kljucne_rijeci': ['licenca','trener','MTS','obrazovanje','stručno usavršavanje'],
|
||||
'izdano_datum': '2023-08-04',
|
||||
'sadrzaj': '''PRAVILNIK O LICENCIRANJU TRENERA U SPORTU (NN 89/23)
|
||||
|
||||
OPĆE ODREDBE
|
||||
Pravilnik uređuje uvjete, postupke, kategorije i obveze stručnog usavršavanja trenera u sustavu sporta RH.
|
||||
|
||||
KATEGORIJE TRENERSKIH LICENCI (HOO sustav):
|
||||
|
||||
I. KATEGORIJA — Vrhunski trener:
|
||||
- Specijalistički studij iz područja kineziologije ili sporta (180+ ECTS)
|
||||
- Najmanje 10 godina trenerskog iskustva u 1. ligi
|
||||
- Trenirao reprezentaciju ili I. kategorije sportaše
|
||||
- Položen specijalistički ispit
|
||||
- Trajna licenca uz redovita usavršavanja
|
||||
|
||||
II. KATEGORIJA — Profesionalni trener:
|
||||
- Završen sveučilišni studij kineziologije (180 ECTS)
|
||||
- 5 godina trenerskog iskustva
|
||||
- Položen profesionalni ispit
|
||||
- Licenca obnovljiva svake 4 godine
|
||||
|
||||
III. KATEGORIJA — Kvalificirani trener:
|
||||
- Završen viši sveučilišni studij ili specijalistička obuka
|
||||
- 2 godine trenerskog iskustva
|
||||
- Položen kvalifikacijski ispit
|
||||
- Licenca obnovljiva svake 3 godine
|
||||
|
||||
IV. KATEGORIJA — Trener pomagač:
|
||||
- Završeno srednje obrazovanje + obuka HOO
|
||||
- 1 godina iskustva
|
||||
- Položen osnovni ispit
|
||||
- Licenca obnovljiva svake 2 godine
|
||||
|
||||
V. KATEGORIJA — Volonter / asistent:
|
||||
- Osnovna obuka (60-100 sati)
|
||||
- Bez iskustva potrebno
|
||||
- Mentorstvo iskusnijeg trenera
|
||||
- Licenca godišnja
|
||||
|
||||
UEFA NOGOMETNE LICENCE:
|
||||
- UEFA Pro: najviša razina za 1. lige (HNL, premijer lige)
|
||||
- UEFA A: za 2. lige i mlađe selekcije
|
||||
- UEFA B: za regionalna natjecanja, kadetska
|
||||
- UEFA C: za amaterska, dječja
|
||||
- UEFA Goalkeeper A/B: specijalizirano za vratare
|
||||
|
||||
DRUGE NACIONALNE LICENCE:
|
||||
- FIBA (košarka): Pro / A / B kategorije
|
||||
- FINA (plivanje, vaterpolo): International / National
|
||||
- World Athletics: Level 1-5
|
||||
- IHF (rukomet): Master Coach / A / B
|
||||
- UCI (biciklizam): Tier 1-4
|
||||
- IJF (judo): A / B / C / D
|
||||
- WT (taekwondo): International / National
|
||||
|
||||
POSTUPAK STJECANJA LICENCE:
|
||||
1) Prijava pri nacionalnom savezu
|
||||
2) Završetak obvezne obuke (teorija + praksa)
|
||||
3) Polaganje ispita
|
||||
4) Registar pri MTS i savezu
|
||||
5) Izdavanje licence s rokom
|
||||
|
||||
KONTINUIRANO STRUČNO USAVRŠAVANJE (CSU):
|
||||
- Licencirani treneri moraju pohađati godišnje seminare
|
||||
- Minimum 30 sati godišnje za I/II kategorije
|
||||
- 20 sati za III kategoriju
|
||||
- 15 sati za IV/V
|
||||
|
||||
REGISTAR TRENERA:
|
||||
Centralni registar vodi MTS. Javan, dostupan na mtus.gov.hr.
|
||||
|
||||
OBVEZNA RADNA RESURS:
|
||||
Klub mora imati glavnog trenera s odgovarajućom licencom za razinu natjecanja.
|
||||
|
||||
SANKCIJE:
|
||||
- Suspendiranje licence za teški prekršaj
|
||||
- Trajno oduzimanje licence za doping podršku ili nasilje
|
||||
- Globa od 500 do 5.000 EUR'''
|
||||
},
|
||||
{
|
||||
'title': 'Etički kodeks sporta RH (HOO + MTS)',
|
||||
'kratak_opis': 'Nacionalni etički kodeks — sport bez korupcije, fair play, antidoping kultura.',
|
||||
'vrsta': 'pravilnik', 'razina': 'HOO', 'organizacija': 'HOO + MTS',
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://www.hoo.hr/hr-hr/o-hoo-u/etika',
|
||||
'kljucne_rijeci': ['etika','fair play','korupcija','dopinga','vrijednosti'],
|
||||
'izdano_datum': '2022-01-01',
|
||||
'sadrzaj': '''ETIČKI KODEKS SPORTA RH
|
||||
|
||||
PREAMBULA
|
||||
Sport je zajedničko dobro hrvatskog društva i temeljna vrijednost koja promiče zdravlje, fair play, jednake mogućnosti, izvrsnost i poštivanje. Ovaj kodeks obvezuje sve sudionike u sustavu sporta RH.
|
||||
|
||||
TEMELJNA NAČELA:
|
||||
|
||||
1. FAIR PLAY:
|
||||
- Poštovanje pravila igre
|
||||
- Poštovanje protivnika i sudaca
|
||||
- Poštovanje rezultata
|
||||
- Sportaš/trener ne smije utjecati na rezultat van legitimnih sportskih sredstava
|
||||
|
||||
2. INTEGRITET:
|
||||
- Zabrana namještanja utakmica
|
||||
- Zabrana sudjelovanja u klađenju na vlastite utakmice
|
||||
- Obvezna prijava ponuda za namještanje
|
||||
- Suradnja s istražnim tijelima
|
||||
|
||||
3. ANTI-DOPING:
|
||||
- Pridržavanje WADA i HASMS propisa
|
||||
- "Čisto" sportsko okruženje
|
||||
- Edukacija sportaša od mlađih kategorija
|
||||
|
||||
4. ZAŠTITA DJECE:
|
||||
- Zabrana svakog fizičkog i psihičkog zlostavljanja
|
||||
- Trener uvijek dostupan i odgovoran
|
||||
- Sigurnosne provjere stručnog kadra (kazneni list)
|
||||
- Roditeljska suglasnost za sve aktivnosti maloljetnika
|
||||
|
||||
5. JEDNAKE MOGUĆNOSTI:
|
||||
- Bez diskriminacije po spolu, dobi, rasi, religiji, invaliditetu
|
||||
- Pristup sportu za sve
|
||||
- Posebne mjere za uključivanje skupina u nepovoljnom položaju
|
||||
|
||||
6. POŠTOVANJE LJUDSKIH PRAVA:
|
||||
- Zabrana dijela sa sportskim subjektima koji krše ljudska prava
|
||||
- Pravo sportaša na slobodu izražavanja unutar etičkih granica
|
||||
- Privatnost — zaštita osobnih podataka (GDPR)
|
||||
|
||||
7. SUKOB INTERESA:
|
||||
- Funkcionari klubova/saveza moraju prijaviti sukob interesa
|
||||
- Zabrana istovremenog upravljanja konkurentskim klubovima
|
||||
- Transparentnost u odlučivanju
|
||||
|
||||
8. FINANCIJSKA TRANSPARENTNOST:
|
||||
- Javna objava godišnjih financijskih izvještaja
|
||||
- Transparentnost donacija i sponzorstava
|
||||
- Antikorupcijska politika
|
||||
|
||||
9. ODGOVORNO PONAŠANJE U JAVNOSTI:
|
||||
- Sportaši kao uzori
|
||||
- Zabrana govora mržnje
|
||||
- Odgovornost na društvenim mrežama
|
||||
|
||||
10. ZAŠTITA OKOLIŠA:
|
||||
- Održivi sport
|
||||
- Smanjenje ekološkog otiska
|
||||
- Ekološka osviještenost u organizaciji događanja
|
||||
|
||||
TIJELA ZA PRIMJENU KODEKSA:
|
||||
- Etička komisija HOO
|
||||
- Disciplinski sudovi nacionalnih saveza
|
||||
- HASMS za doping pitanja
|
||||
|
||||
POSTUPAK PRIJAVE KRŠENJA:
|
||||
1) Pisana prijava (anonimna moguća)
|
||||
2) Postupak Etičke komisije (60 dana)
|
||||
3) Mogućnost žalbe
|
||||
4) Konačna odluka
|
||||
|
||||
SANKCIJE:
|
||||
- Opomena
|
||||
- Javna isprika
|
||||
- Privremena suspenzija (sportska/funkcijska)
|
||||
- Trajno isključenje iz sustava sporta
|
||||
- Materijalna kazna do 50.000 EUR'''
|
||||
},
|
||||
{
|
||||
'title': 'Pravilnik o klupskom licenciranju HNS (UEFA + HNS standardi)',
|
||||
'kratak_opis': 'Klupska licenca za nacionalna i europska natjecanja — financijski, infrastrukturni, sportski kriteriji.',
|
||||
'vrsta': 'pravilnik_savez', 'razina': 'Savez', 'organizacija': 'HNS', 'sport': 'nogomet',
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://hns-cff.hr/regulatorni-okvir/',
|
||||
'kljucne_rijeci': ['HNS','UEFA','licenca','klub','financijski','FFP'],
|
||||
'izdano_datum': '2024-01-01',
|
||||
'sadrzaj': '''PRAVILNIK O KLUPSKOM LICENCIRANJU HNS (2024)
|
||||
|
||||
OPĆE ODREDBE
|
||||
Klupsko licenciranje obvezno je za sve klubove koji nastupaju u natjecanjima HNS-a i UEFA-e.
|
||||
|
||||
HNS KATEGORIJE LICENCI:
|
||||
- HNL Pro Licenca: za 1. HNL klubove + UEFA natjecanja
|
||||
- HNL Standard Licenca: za 2. HNL klubove
|
||||
- HNS Regional Licenca: za 3. HNL i niže
|
||||
|
||||
KRITERIJI ZA HNL Pro LICENCU:
|
||||
|
||||
A. SPORTSKI KRITERIJI:
|
||||
- Punokrvni juniorski sustav (U-19, U-17, U-15, U-13, U-11)
|
||||
- Minimalno 80% trenera s UEFA A licencom
|
||||
- Glavni trener UEFA Pro
|
||||
- Liječnička služba (sportski liječnik specijalist)
|
||||
- Anti-doping politika
|
||||
|
||||
B. INFRASTRUKTURNI KRITERIJI:
|
||||
- Stadion kapaciteta minimum 5.000
|
||||
- Reflektori 1.500 lux+ za TV prijenos
|
||||
- VIP loža, press soba, mixed zone
|
||||
- Pomoćno igralište za zagrijavanje
|
||||
- VAR oprema (od 2024.)
|
||||
|
||||
C. FINANCIJSKI KRITERIJI (FFP):
|
||||
- Pozitivni operativni rezultat (3-godišnji prosjek)
|
||||
- Plaće igrača < 70% prihoda
|
||||
- Ažurni financijski izvještaji (godišnji + polugodišnji)
|
||||
- Bez nepodmirenih obveza prema igračima/savezima
|
||||
- Bez nepodmirenih poreznih obveza
|
||||
|
||||
D. PRAVNI KRITERIJI:
|
||||
- Registracija u Sudski registar (š.d.d.) ili Registar udruga
|
||||
- Statut usklađen s Zakonom o sportu i HNS pravilima
|
||||
- Predsjednik i upravljačka struktura registrirani u HNS
|
||||
|
||||
E. PRAVA I OBVEZE LICENCIRANIH KLUBOVA:
|
||||
- Pravo nastupa u natjecanjima
|
||||
- Obveza poštovanja UEFA Financial Sustainability Regulations
|
||||
- Obveza objave godišnjih izvještaja u UEFA-i
|
||||
- Obveza pohađanja UEFA workshopova
|
||||
|
||||
POSTUPAK LICENCIRANJA:
|
||||
1) Klub podnosi zahtjev HNS Komisiji za licenciranje (do 31.3.)
|
||||
2) Provjera dokumentacije + on-site inspekcija
|
||||
3) Prva instanca: HNS Komisija za licenciranje
|
||||
4) Žalbeni odbor: HNS Žalbeno povjerenstvo
|
||||
5) Konačna instanca: CAS (Court of Arbitration for Sport)
|
||||
|
||||
SANKCIJE:
|
||||
- Odbijanje licence: nedopušten nastup u natjecanju
|
||||
- Privremena suspenzija licence: za teške financijske probleme
|
||||
- Oduzimanje licence: za prijevaru ili teško kršenje
|
||||
|
||||
PGŽ KLUBOVI S HNL Pro LICENCOM:
|
||||
- HNK Rijeka (Stadion Rujevica, kapacitet 8.136)
|
||||
|
||||
UEFA FINANCIAL SUSTAINABILITY (FFP):
|
||||
- "Squad cost ratio": plaće+amortizacija+agentsko ≤70% prihoda
|
||||
- Limit godišnjih gubitaka: max 60 mil EUR (3-godišnje razdoblje)
|
||||
- Postupne sankcije: globa, transferna ograničenja, oduzimanje bodova, suspenzija europskih natjecanja
|
||||
|
||||
NOVOSTI 2024./2025.:
|
||||
- Stroži kontrolni mehanizmi za dokazivanje kapitala
|
||||
- Obvezna mjesečna izvješća o tijeku novca
|
||||
- Pojačana suradnja s Ministarstvom financija RH'''
|
||||
},
|
||||
{
|
||||
'title': 'Statut HOO (Hrvatski olimpijski odbor)',
|
||||
'kratak_opis': 'Temeljni akt HOO-a — članstvo nacionalnih saveza, olimpijski pokret, kategorizacija.',
|
||||
'vrsta': 'statut', 'razina': 'HOO', 'organizacija': 'HOO',
|
||||
'sluzbeni_glasnik': None,
|
||||
'izvor_url': 'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',
|
||||
'kljucne_rijeci': ['HOO','statut','olimpijski','MOK','nacionalni savez'],
|
||||
'izdano_datum': '2024-01-01',
|
||||
'sadrzaj': '''STATUT HRVATSKOG OLIMPIJSKOG ODBORA
|
||||
|
||||
OPĆE ODREDBE
|
||||
HOO je dragovoljna, samostalna, nepolitička i neprofitna organizacija. Najviša krovna sportska organizacija RH. Priznata od MOK-a (IOC) kao Nacionalni olimpijski odbor (NOC) Hrvatske od 17.1.1992.
|
||||
|
||||
CILJEVI HOO-a:
|
||||
1) Promocija olimpijskih ideala
|
||||
2) Razvoj sporta na svim razinama
|
||||
3) Priprema reprezentacija za OI, ZOI, Mediteranske igre, Sveučilišne igre
|
||||
4) Kategorizacija i podrška vrhunskim sportašima
|
||||
5) Etika i fair play u sportu
|
||||
6) Borba protiv dopinga
|
||||
7) Zaštita olimpijskih simbola
|
||||
|
||||
ČLANOVI HOO-a:
|
||||
- Nacionalni sportski savezi olimpijskih sportova
|
||||
- Nacionalni sportski savezi neolimpijskih sportova s posebnim statusom
|
||||
- Trenutno 80+ punopravnih članica + 10 pridruženih
|
||||
|
||||
UVJETI ČLANSTVA NACIONALNOG SAVEZA:
|
||||
1) Registracija u Registru sportskih organizacija RH
|
||||
2) Najmanje 5 godina kontinuiranog rada
|
||||
3) Provođenje minimum 1 nacionalnog prvenstva godišnje
|
||||
4) Najmanje 30 registriranih klubova-članica
|
||||
5) Pridržavanje WADA Code-a
|
||||
6) Pridržavanje HOO Etičkog kodeksa
|
||||
7) Demokratska struktura (demokratski izabrana tijela)
|
||||
|
||||
TIJELA HOO-a:
|
||||
|
||||
1. SKUPŠTINA HOO:
|
||||
- Vrhovno tijelo
|
||||
- Sastavljena od predstavnika svih saveza-članica
|
||||
- Sastaje se najmanje 1× godišnje
|
||||
- Bira: Predsjednika, Izvršni odbor, Nadzorni odbor, Etičku komisiju
|
||||
|
||||
2. IZVRŠNI ODBOR:
|
||||
- 9-15 članova
|
||||
- Predsjednik HOO-a + dopredsjednici + članovi
|
||||
- Sastaje se mjesečno
|
||||
- Operativno vođenje HOO-a
|
||||
|
||||
3. PREDSJEDNIK HOO-a:
|
||||
- Mandat 4 godine, max 2 mandata
|
||||
- Predstavlja HOO u zemlji i inozemstvu
|
||||
- Trenutno: Zlatko Mateša (predsjednik)
|
||||
|
||||
4. NADZORNI ODBOR:
|
||||
- 3 člana
|
||||
- Kontrola financijskog poslovanja
|
||||
- Godišnji izvještaj Skupštini
|
||||
|
||||
5. ETIČKA KOMISIJA:
|
||||
- Razmatra etička pitanja
|
||||
- Sankcije za kršenje Etičkog kodeksa
|
||||
|
||||
6. KOMISIJA SPORTAŠA:
|
||||
- Predstavlja interese aktivnih sportaša
|
||||
- 4-6 članova izabranih među sportašima
|
||||
|
||||
7. KOMISIJA ZA KATEGORIZACIJU:
|
||||
- Verificira rezultate sportaša
|
||||
- Donosi odluku o kategorizaciji (I-V)
|
||||
|
||||
DEPARTMANI / STRUČNE SLUŽBE HOO-a:
|
||||
- Stručni odjel za sport
|
||||
- Odjel za olimpijsku pripremu
|
||||
- Odjel za međunarodne odnose
|
||||
- Odjel za marketing i sponzorstva
|
||||
- Odjel za pravna pitanja
|
||||
- Odjel za financije
|
||||
|
||||
OLIMPIJSKI SIMBOLI:
|
||||
- HOO ima isključivo pravo korištenja olimpijske oznake i amblema u RH
|
||||
- Komercijalna upotreba olimpijskih simbola moguća uz licencu HOO-a
|
||||
|
||||
VEZE S MOK-om i KONTINENTALNIM:
|
||||
- HOO sudjeluje na MOK Sjednicama
|
||||
- HOO je član Europskih olimpijskih odbora (EOC)
|
||||
- HOO je član ANOC (Asocijacija nacionalnih olimpijskih odbora)
|
||||
- HOO je član Mediterranean Games Committee
|
||||
|
||||
FINANCIRANJE HOO-a:
|
||||
- Državna proračunska sredstva (Hrvatska Vlada)
|
||||
- MOK i EOC donacije/program
|
||||
- Sponzorstva (Hrvatska poštanska banka, Erste banka, Adidas itd.)
|
||||
- Olimpijska solidarnost
|
||||
|
||||
POSEBNE OVLASTI:
|
||||
- Podizanje hrvatske zastave na OI
|
||||
- Imenovanje Šefa misije (Chef de Mission) za OI/ZOI
|
||||
- Suglasnost za održavanje međunarodnih natjecanja u RH
|
||||
- Predlaganje državnih nagrada u sportu
|
||||
|
||||
STATUS U RH:
|
||||
HOO je akreditiran kao samostalna pravna osoba. Ima posebni status definiran Zakonom o sportu (NN 141/22) i Pravilnikom o registraciji nacionalnih sportskih saveza.'''
|
||||
},
|
||||
]
|
||||
|
||||
def _like_contains(text):
    """Return *text* as a ``%text%`` LIKE/ILIKE pattern with wildcards escaped.

    Backslash is PostgreSQL's default LIKE escape character, so a backslash,
    ``%`` or ``_`` occurring in *text* is matched literally rather than being
    interpreted as a wildcard.
    """
    escaped = text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    return f"%{escaped}%"


def main():
    """Upsert every DOCS entry into pgz_sport.dokumenti and print totals.

    An existing row is looked up by a case-insensitive substring match on the
    first 30 characters of the title with its parenthesised suffix removed.
    When several rows match, one whose title equals the full document title
    is preferred, then the lowest id, so the row chosen is deterministic.

    A matched row gets its ``sadrzaj`` overwritten and ``kratak_opis``
    replaced; ``izvor_url``, ``sluzbeni_glasnik``, ``kljucne_rijeci`` and
    ``izdano_datum`` are only filled in when currently NULL. Unmatched
    documents are inserted as new, active rows; a missing ``sport`` key is
    stored as NULL.

    The connection runs in autocommit mode, so every statement is persisted
    immediately; the connection is closed even if a statement raises.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        with conn.cursor() as cu:
            n_added = 0
            n_updated = 0

            for d in DOCS:
                # Fuzzy key: title without the "(...)" suffix, first 30 chars.
                # Fall back to the whole title if that would be empty, since an
                # empty key becomes '%%' and would match an arbitrary row.
                fragment = d['title'].split('(')[0].strip()[:30] or d['title']
                cu.execute(
                    """SELECT id FROM pgz_sport.dokumenti
                       WHERE title ILIKE %s
                       ORDER BY (LOWER(title) = LOWER(%s)) DESC, id
                       LIMIT 1""",
                    (_like_contains(fragment), d['title']))
                row = cu.fetchone()

                if row:
                    cu.execute(
                        """UPDATE pgz_sport.dokumenti SET
                               sadrzaj=%s, kratak_opis=COALESCE(%s, kratak_opis),
                               izvor_url=COALESCE(izvor_url, %s),
                               sluzbeni_glasnik=COALESCE(sluzbeni_glasnik, %s),
                               kljucne_rijeci=COALESCE(kljucne_rijeci, %s),
                               izdano_datum=COALESCE(izdano_datum, %s)
                           WHERE id=%s""",
                        (d['sadrzaj'], d['kratak_opis'], d['izvor_url'],
                         d['sluzbeni_glasnik'], d['kljucne_rijeci'],
                         d['izdano_datum'], row[0]))
                    n_updated += 1
                    print(f" ✓ Updated #{row[0]}: {d['title'][:55]}")
                else:
                    cu.execute(
                        """INSERT INTO pgz_sport.dokumenti
                           (title, kratak_opis, sadrzaj, vrsta, razina, organizacija, sport,
                            sluzbeni_glasnik, izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                           VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)""",
                        (d['title'], d['kratak_opis'], d['sadrzaj'], d['vrsta'], d['razina'],
                         d['organizacija'], d.get('sport'), d['sluzbeni_glasnik'],
                         d['izvor_url'], d['kljucne_rijeci'], d['izdano_datum']))
                    n_added += 1
                    print(f" + Added: {d['title'][:55]}")

            print(f"\nAdded: {n_added}, Updated: {n_updated}")

            # Rows whose sadrzaj is longer than 1000 characters.
            cu.execute("""SELECT count(*) FROM pgz_sport.dokumenti
                          WHERE length(COALESCE(sadrzaj,'')) > 1000""")
            print(f"Dokumenata s full text: {cu.fetchone()[0]}")
    finally:
        # psycopg2's connection context manager does not close the connection,
        # so close it explicitly on both the success and the error path.
        conn.close()


# Guarded so that importing this module does not write to the database.
if __name__ == '__main__':
    main()
|
||||
Executable
+7
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Run the three scrapers one after another, showing only the last two lines
# of each scraper's combined stdout/stderr, then sleep for an hour and repeat
# indefinitely.

scrapers=(
  /opt/pgz-sport/scrapers/gov_hr_sport_scraper.py
  /opt/pgz-sport/scrapers/sukob_sport_scraper.py
  /opt/pgz-sport/scrapers/sport_federations_deep.py
)

while :; do
  for scraper in "${scrapers[@]}"; do
    python3 "$scraper" 2>&1 | tail -2
  done
  sleep 3600
done
|
||||
Executable
+157
@@ -0,0 +1,157 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fetch real legal texts from narodne-novine.nn.hr and key sources.
|
||||
Update sadrzaj column for accurate RAG."""
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
import re
|
||||
import time
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
# Top legal documents to fetch (ID-podudaranje preko title pattern)
|
||||
TARGETS = [
|
||||
{
|
||||
'title_pattern': 'Zakon o sportu',
|
||||
'razina': 'RH',
|
||||
'urls': [
|
||||
'https://narodne-novine.nn.hr/clanci/sluzbeni/2022_12_141_2151.html',
|
||||
'https://narodne-novine.nn.hr/clanci/sluzbeni/2024_10_122_2087.html',
|
||||
],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o udrugama',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2014_06_74_1390.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o sprečavanju dopinga',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2017_07_70_1671.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Pravilnik o stručnim poslovima',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o lovstvu',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/full/2018_11_99_1955.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o volonterstvu',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2007_06_58_1813.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o pravu na pristup informacijama',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2013_02_25_403.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'Zakon o sprječavanju nereda',
|
||||
'razina': 'RH',
|
||||
'urls': ['https://narodne-novine.nn.hr/clanci/sluzbeni/2003_07_117_1631.html'],
|
||||
},
|
||||
{
|
||||
'title_pattern': 'GDPR',
|
||||
'razina': 'EU',
|
||||
'urls': ['https://eur-lex.europa.eu/legal-content/HR/TXT/HTML/?uri=CELEX:32016R0679'],
|
||||
},
|
||||
]
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 PGZSport/1.0',
|
||||
'Accept': 'text/html,application/xhtml+xml',
|
||||
}
|
||||
|
||||
def clean_html(html):
    """Strip markup from an HTML string and return readable plain text.

    Args:
        html: Raw HTML source.

    Returns:
        The text with script/style/comment blocks removed, ``<br>`` and
        closing block tags turned into newlines, all remaining tags dropped,
        character references decoded, and whitespace collapsed.
    """
    # Imported locally: the parameter is named ``html`` and would shadow a
    # module-level ``import html``.
    from html import unescape

    # Remove content that is never visible text.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
    html = re.sub(r'<!--.*?-->', '', html, flags=re.DOTALL)

    # Line breaks and closing block-level tags become newlines.
    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.I)
    html = re.sub(r'</(p|div|h[1-6]|li|tr)\s*>', '\n', html, flags=re.I)

    # Strip remaining tags.
    html = re.sub(r'<[^>]+>', '', html)

    # Decode character references in a single pass, after tags are gone, so
    # that "&amp;lt;" becomes "&lt;" rather than "<". Non-breaking spaces are
    # mapped to plain spaces so the whitespace collapse below handles them.
    html = unescape(html).replace('\xa0', ' ')

    # Collapse runs of spaces/tabs and runs of blank lines.
    html = re.sub(r'[ \t]+', ' ', html)
    html = re.sub(r'\n\s*\n+', '\n\n', html)
    return html.strip()
|
||||
|
||||
def fetch_url(url, max_size=200000):
    """Download a page and return its cleaned text, or None on failure.

    Args:
        url: Address to fetch with the module-level HEADERS.
        max_size: Maximum number of characters of cleaned text to return.

    Returns:
        The cleaned text truncated to ``max_size`` characters, or None when
        the response status is not 200 or any exception is raised.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=20)
        if response.status_code != 200:
            return None
        page = response.text

        # NN.hr structure: the article body sits in a "clanak" div that is
        # followed by the "metapodaci" div.
        article = re.search(
            r'<div[^>]*class="[^"]*clanak[^"]*"[^>]*>(.*?)</div>\s*<div[^>]*class="metapodaci"',
            page, re.DOTALL | re.I)
        if article is not None:
            body = article.group(1)
        else:
            # Fallback: drop page chrome (header, footer, navigation).
            body = page
            for tag in ('header', 'footer', 'nav'):
                body = re.sub(rf'<{tag}.*?</{tag}>', '', body, flags=re.DOTALL | re.I)

        return clean_html(body)[:max_size]
    except Exception as e:
        print(f" err fetch {url[:80]}: {e}")
        return None
|
||||
|
||||
def main():
    """Fetch the full text for every entry in TARGETS and store it in the DB.

    For each target, the first pgz_sport.dokumenti row whose title matches
    ``title_pattern`` (ILIKE) at the given ``razina`` is looked up. The
    target's URLs are tried in order until one yields more than 1000
    characters of text, which is then written to ``sadrzaj``. ``izvor_url``
    is only filled in when it is currently NULL.

    The connection is closed even when a query or fetch raises.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True

    n_updated = 0
    n_failed = 0

    try:
        cu = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

        for tgt in TARGETS:
            cu.execute("""SELECT id, title FROM pgz_sport.dokumenti
                          WHERE title ILIKE %s AND razina = %s
                          ORDER BY id LIMIT 1""",
                       (f"%{tgt['title_pattern']}%", tgt['razina']))
            row = cu.fetchone()
            if not row:
                print(f" ⊘ Not found: {tgt['title_pattern']} ({tgt['razina']})")
                continue

            # Try urls in order until one works
            full_text = ''
            used_url = None
            for url in tgt['urls']:
                text = fetch_url(url)
                if text and len(text) > 1000:
                    full_text = text
                    used_url = url
                    break
                time.sleep(0.5)  # short pause before trying the next URL

            if not full_text:
                print(f" ✗ {row['title'][:50]} — failed all URLs")
                n_failed += 1
                continue

            # Update DB
            cu.execute("""UPDATE pgz_sport.dokumenti
                          SET sadrzaj = %s, izvor_url = COALESCE(izvor_url, %s)
                          WHERE id = %s""",
                       (full_text, used_url, row['id']))
            n_updated += 1
            print(f" ✓ {row['title'][:60]} ({len(full_text)} bytes)")
            time.sleep(1)  # rate limit

        print(f"\nUpdated: {n_updated}, Failed: {n_failed}")
    finally:
        # Release the connection on both the success and the error path.
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+121
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reingest godišnjaka 2006-2024 — full text from PDFs."""
|
||||
import os, re, hashlib, subprocess, requests, psycopg2
|
||||
from datetime import date
|
||||
|
||||
# Connection settings for the rinet_v3 database.
# The original call passed `port` twice (6432 and 5432), which Python rejects
# with "SyntaxError: keyword argument repeated". 6432 is kept as the first
# value given for this host — TODO confirm it is the intended port.
# NOTE(review): the password is hard-coded; consider reading it from the
# environment instead.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
BASE = "https://sport-pgz.hr/upload/dokumenti"
|
||||
|
||||
GODISNJACI = [
|
||||
("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
|
||||
("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
|
||||
("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
|
||||
("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
|
||||
("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
|
||||
("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
|
||||
("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
|
||||
("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
|
||||
("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
|
||||
("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
|
||||
("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
|
||||
("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
|
||||
("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
|
||||
("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
|
||||
("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
|
||||
("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
|
||||
("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
|
||||
("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
|
||||
]
|
||||
|
||||
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
os.makedirs(OUT_DIR, exist_ok=True)
|
||||
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA})
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
# Ingest every yearbook: download the PDF, extract its text with pdftotext,
# then update the matching pgz_sport.dokumenti row or insert a new one.
for year, url in GODISNJACI:
    pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
    txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

    # Download if missing; a cached file under 100000 bytes is fetched again.
    if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
        print(f" [{year}] downloading from {url}")
        try:
            r = s.get(url, timeout=120)
            if r.status_code != 200:
                print(f" [{year}] HTTP {r.status_code}, skip")
                continue
            with open(pdf_path, "wb") as f:
                f.write(r.content)
            print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
        except Exception as e:
            print(f" [{year}] download failed: {e}")
            continue

    # Extract text via pdftotext; an existing .txt of at least 1000 bytes is reused.
    if not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
        print(f" [{year}] extracting text…")
        try:
            subprocess.run(["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                           check=True, timeout=300, capture_output=True)
        except Exception as e:
            print(f" [{year}] pdftotext failed: {e}")
            continue

    # Read text
    try:
        with open(txt_path, encoding='utf-8', errors='replace') as f:
            text = f.read()
    except Exception as e:
        print(f" [{year}] read failed: {e}")
        continue

    if len(text) < 5000:
        print(f" [{year}] text too short ({len(text)} chars), skip")
        continue

    # SHA-256 hex digest truncated to 40 characters, stored in the sha1 column.
    sha = hashlib.sha256(text.encode()).hexdigest()[:40]
    # Page count is derived from form-feed characters (chr(12)) in the text.
    pages = text.count(chr(12)) + 1

    title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
    if year in ("2023", "2024"):
        title = f"Sportski godišnjak ZS PGŽ {year} (web)"
    izdano = date(int(year), 12, 31)
    opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
    tags = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

    # Update existing or insert new.
    # The title pattern uses the single-character wildcard "_" so it matches
    # both "godisnjak" and "godišnjak"; the titles written below contain "š",
    # which a plain ASCII pattern never matches, so a re-run would insert a
    # duplicate row. The source PDF URL is accepted as a match as well.
    cu.execute("""SELECT id FROM pgz_sport.dokumenti
                  WHERE (LOWER(title) LIKE %s OR fname LIKE %s OR pdf_url = %s)
                  ORDER BY id LIMIT 1""",
               (f"%godi_njak%{year}%", f"%godisnjak_{year}%", url))
    existing = cu.fetchone()

    if existing:
        cu.execute("""UPDATE pgz_sport.dokumenti SET
                      title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                      url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
                      kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
                      WHERE id=%s""",
                   (title, text, sha, int(year), izdano,
                    url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, tags,
                    existing[0]))
        print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
    else:
        cu.execute("""INSERT INTO pgz_sport.dokumenti
                      (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                       vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
                      VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
                   (title, text, sha, int(year), izdano, url, url, url,
                    'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, tags))
        new_id = cu.fetchone()[0]
        print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")
|
||||
|
||||
# Final
|
||||
# Final summary of all yearbook rows, then close the connection.
cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
              WHERE vrsta='godisnjak' ORDER BY godina""")
print("\n=== Godišnjaci u DB ===")
for godina, chars, pdf_url in cu.fetchall():
    # length(sadrzaj) and pdf_url are NULL for rows stored without text or
    # without a PDF link; fall back to 0 / '' so formatting does not raise.
    print(f" {godina}: {chars or 0:,} chars ({(pdf_url or '')[:80]})")
conn.close()
|
||||
+121
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reingest godišnjaka 2006-2024 — full text from PDFs."""
|
||||
import os, re, hashlib, subprocess, requests, psycopg2
|
||||
from datetime import date
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
BASE = "https://sport-pgz.hr/upload/dokumenti"
|
||||
|
||||
GODISNJACI = [
|
||||
("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
|
||||
("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
|
||||
("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
|
||||
("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
|
||||
("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
|
||||
("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
|
||||
("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
|
||||
("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
|
||||
("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
|
||||
("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
|
||||
("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
|
||||
("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
|
||||
("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
|
||||
("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
|
||||
("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
|
||||
("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
|
||||
("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
|
||||
("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
|
||||
]
|
||||
|
||||
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
os.makedirs(OUT_DIR, exist_ok=True)
|
||||
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA})
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
# Ingest every yearbook: download the PDF, extract its text with pdftotext,
# then update the matching pgz_sport.dokumenti row or insert a new one.
for year, url in GODISNJACI:
    pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
    txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

    # Download if missing; a cached file under 100000 bytes is fetched again.
    if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
        print(f" [{year}] downloading from {url}")
        try:
            r = s.get(url, timeout=120)
            if r.status_code != 200:
                print(f" [{year}] HTTP {r.status_code}, skip")
                continue
            with open(pdf_path, "wb") as f:
                f.write(r.content)
            print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
        except Exception as e:
            print(f" [{year}] download failed: {e}")
            continue

    # Extract text via pdftotext; an existing .txt of at least 1000 bytes is reused.
    if not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
        print(f" [{year}] extracting text…")
        try:
            subprocess.run(["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                           check=True, timeout=300, capture_output=True)
        except Exception as e:
            print(f" [{year}] pdftotext failed: {e}")
            continue

    # Read text
    try:
        with open(txt_path, encoding='utf-8', errors='replace') as f:
            text = f.read()
    except Exception as e:
        print(f" [{year}] read failed: {e}")
        continue

    if len(text) < 5000:
        print(f" [{year}] text too short ({len(text)} chars), skip")
        continue

    # SHA-256 hex digest truncated to 40 characters, stored in the sha1 column.
    sha = hashlib.sha256(text.encode()).hexdigest()[:40]
    # Page count is derived from form-feed characters (chr(12)) in the text.
    pages = text.count(chr(12)) + 1

    title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
    if year in ("2023", "2024"):
        title = f"Sportski godišnjak ZS PGŽ {year} (web)"
    izdano = date(int(year), 12, 31)
    opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
    tags = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

    # Update existing or insert new.
    # The title pattern uses the single-character wildcard "_" so it matches
    # both "godisnjak" and "godišnjak"; the titles written below contain "š",
    # which a plain ASCII pattern never matches, so a re-run would insert a
    # duplicate row. The source PDF URL is accepted as a match as well.
    cu.execute("""SELECT id FROM pgz_sport.dokumenti
                  WHERE (LOWER(title) LIKE %s OR fname LIKE %s OR pdf_url = %s)
                  ORDER BY id LIMIT 1""",
               (f"%godi_njak%{year}%", f"%godisnjak_{year}%", url))
    existing = cu.fetchone()

    if existing:
        cu.execute("""UPDATE pgz_sport.dokumenti SET
                      title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                      url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
                      kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
                      WHERE id=%s""",
                   (title, text, sha, int(year), izdano,
                    url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, tags,
                    existing[0]))
        print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
    else:
        cu.execute("""INSERT INTO pgz_sport.dokumenti
                      (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                       vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
                      VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
                   (title, text, sha, int(year), izdano, url, url, url,
                    'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, tags))
        new_id = cu.fetchone()[0]
        print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")
|
||||
|
||||
# Final
|
||||
# Final summary of all yearbook rows, then close the connection.
cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
              WHERE vrsta='godisnjak' ORDER BY godina""")
print("\n=== Godišnjaci u DB ===")
for godina, chars, pdf_url in cu.fetchall():
    # length(sadrzaj) and pdf_url are NULL for rows stored without text or
    # without a PDF link; fall back to 0 / '' so formatting does not raise.
    print(f" {godina}: {chars or 0:,} chars ({(pdf_url or '')[:80]})")
conn.close()
|
||||
Executable
+229
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Godišnjak ZS PGŽ 2025 ingest:
|
||||
1) Insert kao full-text dokument
|
||||
2) Update statistika_saveza za 2025
|
||||
3) Update Parasportski savez kontakt
|
||||
4) Insert/update 12 parasportskih klubova
|
||||
5) Mark članove parasporta s flagom
|
||||
"""
|
||||
import psycopg2, re, json, hashlib
|
||||
from datetime import date
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
GODISNJAK_PATH = '/opt/pgz-sport/_data/godisnjak/2025_full.txt'
|
||||
text = open(GODISNJAK_PATH, encoding='utf-8').read()
|
||||
print(f"Loaded godišnjak: {len(text)} chars, {text.count(chr(12))+1} pages")
|
||||
|
||||
# ============================================================
|
||||
# 1) INSERT godišnjak kao dokument
|
||||
# ============================================================
|
||||
# Store the 2025 yearbook text as a document: update the row if one already
# exists, otherwise insert it. GOD_DOC_ID ends up holding the row id.
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()

# SHA-256 hex digest truncated to 40 characters, stored in the sha1 column.
sha = hashlib.sha256(text.encode()).hexdigest()[:40]

# Check whether the document already exists.
# The pattern must also match the title this script inserts below
# ("… Zajednice sportova PGŽ 2025"); requiring the literal "ZS PG" missed it,
# so every re-run inserted a duplicate. "%" between "godi" and "njak" covers
# both "s" and "š".
cu.execute("SELECT id FROM pgz_sport.dokumenti WHERE title ILIKE %s ORDER BY id LIMIT 1",
           ('%Sportski godi%njak%PG%2025%',))
existing = cu.fetchone()

if existing:
    cu.execute("""UPDATE pgz_sport.dokumenti SET
                  sadrzaj = %s, sha1 = %s WHERE id = %s""",
               (text, sha, existing[0]))
    GOD_DOC_ID = existing[0]
    print(f"✓ Updated dok #{GOD_DOC_ID}: Sportski godišnjak ZS PGŽ 2025")
else:
    cu.execute("""INSERT INTO pgz_sport.dokumenti
                  (title, kratak_opis, sadrzaj, vrsta, razina, organizacija,
                   izvor_url, kljucne_rijeci, izdano_datum, sha1, aktivan)
                  VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
               ('Sportski godišnjak Zajednice sportova PGŽ 2025',
                'Godišnji bilten ZS PGŽ — pregled svih 30 županijskih saveza, statistike, najbolji sportaši, kategorizirani sportaši, manifestacije, financiranje',
                text, 'godisnjak', 'PGŽ', 'Zajednica sportova PGŽ',
                'https://zspgz.hr/godisnjak-2025',
                ['godišnjak','ZS PGŽ','2025','statistika','savezi','najbolji sportaši','kategorizirani'],
                '2026-02-19', sha))
    GOD_DOC_ID = cu.fetchone()[0]
    print(f"✓ Inserted dok #{GOD_DOC_ID}: Sportski godišnjak ZS PGŽ 2025")
|
||||
|
||||
# ============================================================
|
||||
# 2) UPDATE statistika_saveza ZA 2025 — pravi brojevi iz godišnjaka
|
||||
# ============================================================
|
||||
# Mapiramo (savez_naziv_pattern → brojke iz PDF-a)
|
||||
STATS_2025 = [
|
||||
# (LIKE pattern, klubova_clanica, registriranih, ukupno_clanova_or_None)
|
||||
('Atletski savez%PG%', 8, 498, 1185),
|
||||
('Boćarski savez%PG%', 63, 950, None),
|
||||
('Boksački savez%PG%', 7, None, None), # iz prošle stat
|
||||
('Jedriličarski savez%PG%', 20, None, None), # 20 klubova
|
||||
('Judo savez%PG%', 9, 870, None),
|
||||
('Karate savez%PG%', 21, 1980, None),
|
||||
('Kickboxing savez%PG%', 12, None, None), # ručno tipovi p.101
|
||||
('Košarkaški savez%PG%', 11, 180, None), # iz preliminarnog parsa
|
||||
('Kuglački savez%PG%', 20, 438, None),
|
||||
('Nogometni savez%PG%', 57, None, None),
|
||||
('Odbojkaški savez%PG%', 15, None, None),
|
||||
('Pikado savez%PG%', None, None, None),
|
||||
('Plivački savez%PG%', 5, None, None),
|
||||
('Rukometni savez%PG%', None, 485, None),
|
||||
('Skijaški savez%PG%', 11, 180, None),
|
||||
('Stolnoteniski savez%PG%', 12, 176, None),
|
||||
('Streličarski savez%PG%', 4, 176, None),
|
||||
('Šahovski savez%PG%', 17, None, None),
|
||||
('Taekwondo savez%PG%', 7, 202, None),
|
||||
('Tenis%savez%PG%', 9, 138, None),
|
||||
('Triatlon savez%PG%', 5, 136, None),
|
||||
('Vaterpolski savez%PG%', 7, 317, None),
|
||||
# Drugi savezi
|
||||
('Sanjkaški savez%PG%', None, 2, None),
|
||||
('%ribolov%moru%', None, None, None),
|
||||
('Udruga %strelj%', 8, 141, None),
|
||||
('Savez školskih sportskih%PG%', 512, None, None), # 512 ŠSD - school sport
|
||||
('%sportske rekreacije%Sport za sve%', None, None, None),
|
||||
('Riječki sportski sveučilišni%', None, None, None),
|
||||
('Parasportski savez%PG%', 12, None, None), # 12 članica
|
||||
]
|
||||
|
||||
# Write the 2025 figures for each federation listed in STATS_2025.
# The fourth tuple element (ukupno) is unpacked but not written anywhere here.
for pattern, klubova, reg, ukupno in STATS_2025:
    cu.execute("SELECT id, naziv FROM pgz_sport.savezi WHERE naziv ILIKE %s LIMIT 1", (pattern,))
    savez = cu.fetchone()
    if not savez:
        print(f" ✗ not found: {pattern}")
        continue
    sid, snaziv = savez

    # Upsert the 2025 statistics row for this federation.
    cu.execute("""SELECT id FROM pgz_sport.statistika_saveza
                  WHERE savez_id=%s AND godina=2025""", (sid,))
    current = cu.fetchone()

    if not current:
        cu.execute("""INSERT INTO pgz_sport.statistika_saveza
                      (savez_id, godina, klubova_clanica, registriranih)
                      VALUES (%s, 2025, %s, %s)""", (sid, klubova, reg))
        print(f" + Inserted 2025: {snaziv[:50]} klubova={klubova} reg={reg}")
        continue

    # COALESCE keeps the stored value wherever the new figure is None.
    cu.execute("""UPDATE pgz_sport.statistika_saveza SET
                  klubova_clanica = COALESCE(%s, klubova_clanica),
                  registriranih = COALESCE(%s, registriranih)
                  WHERE id = %s""", (klubova, reg, current[0]))
    print(f" ✓ Updated 2025: {snaziv[:50]} klubova={klubova} reg={reg}")
|
||||
|
||||
# ============================================================
|
||||
# 3) UPDATE PARASPORTSKI SAVEZ
|
||||
# ============================================================
|
||||
cu.execute("""UPDATE pgz_sport.savezi SET
|
||||
sjediste = COALESCE(sjediste, 'Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka'),
|
||||
web = COALESCE(web, 'http://www.ssoi-pgz.hr'),
|
||||
email = COALESCE(email, 'ssoi-pgz@ssoi-pgz.hr')
|
||||
WHERE naziv ILIKE 'Parasportski savez%PG%'
|
||||
RETURNING id, naziv""")
|
||||
parasport_row = cu.fetchone()
|
||||
if parasport_row:
|
||||
PARASPORT_ID, _ = parasport_row
|
||||
print(f"✓ Parasport savez #{PARASPORT_ID} updated (kontakt)")
|
||||
else:
|
||||
cu.execute("""INSERT INTO pgz_sport.savezi (naziv, razina, sjediste, web, email)
|
||||
VALUES ('Parasportski savez Primorsko-goranske županije', 'zupanijski',
|
||||
'Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka',
|
||||
'http://www.ssoi-pgz.hr', 'ssoi-pgz@ssoi-pgz.hr')
|
||||
RETURNING id""")
|
||||
PARASPORT_ID = cu.fetchone()[0]
|
||||
print(f"+ Parasport savez #{PARASPORT_ID} created")
|
||||
|
||||
# Predsjednik + tajnik u osobe_funkcije
|
||||
def upsert_funkcioner(ime, prezime, funkcija, savez_id=None, klub_id=None):
    """Insert an office holder into pgz_sport.osobe_funkcije if not yet there.

    A row counts as already present when first name, last name and function
    all match case-insensitively; in that case nothing is written.

    Args:
        ime: First name.
        prezime: Last name.
        funkcija: Function/role text.
        savez_id: Federation id; stored only when truthy.
        klub_id: Accepted but not used by this function.
    """
    cu.execute("""SELECT id FROM pgz_sport.osobe_funkcije
                  WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s) AND LOWER(funkcija)=LOWER(%s)""",
               (ime, prezime, funkcija))
    if cu.fetchone():
        return

    columns = ["ime", "prezime", "funkcija"]
    params = [ime, prezime, funkcija]
    if savez_id:
        columns.append("savez_id")
        params.append(savez_id)

    placeholders = ",".join("%s" for _ in params)
    cu.execute(f"INSERT INTO pgz_sport.osobe_funkcije ({', '.join(columns)}) VALUES ({placeholders})",
               params)
|
||||
|
||||
upsert_funkcioner('Zvonimir', 'Brozić', 'predsjednik Parasportskog saveza PGŽ', PARASPORT_ID)
|
||||
upsert_funkcioner('Luka', 'Dobrović', 'tajnik Parasportskog saveza PGŽ', PARASPORT_ID)
|
||||
print("✓ Brozić + Dobrović insertirani u osobe_funkcije")
|
||||
|
||||
# ============================================================
|
||||
# 4) INSERT 12 PARASPORTSKIH KLUBOVA
|
||||
# ============================================================
|
||||
PARASPORT_KLUBOVI = [
|
||||
# (naziv, sport_glavni, sportovi_tag, grad, opis)
|
||||
('Paraatletski klub "Srce" Rijeka', 'parasport', 'paratletika', 'Rijeka', 'atletika invalidi'),
|
||||
('Paraplivački klub "Forca"', 'parasport', 'paraplivanje', 'Rijeka', 'plivanje invalidi'),
|
||||
('Parastolnoteniski klub Rijeka', 'parasport', 'parastolni tenis','Rijeka', 'stolni tenis invalidi'),
|
||||
('Parastreljački klub "Paraolimpijac"', 'parasport', 'parastreljaštvo', 'Rijeka', 'streljaštvo invalidi'),
|
||||
('Sportski klub slijepih "Rijeka"', 'parasport', 'sport slijepih', 'Rijeka', 'multisport za slijepe'),
|
||||
('Parasportski boccia klub "Rijeka"', 'parasport', 'parabocce', 'Rijeka', 'boćanje invalidi'),
|
||||
('Klub dresurnog jahanja za osobe s invaliditetom "Pegaz"', 'parasport', 'parajahanje', 'Rijeka', 'dresurno jahanje invalidi'),
|
||||
('Parasportska udruga za rekreaciju "Rijeka"','parasport', 'pararekrejacija', 'Rijeka', 'rekreacija invalidi'),
|
||||
('KKOI Kostrena', 'parasport', 'multisport', 'Kostrena', 'KK osoba s invaliditetom Kostrena'),
|
||||
('PAK "Rijeka"', 'parasport', 'multisport', 'Rijeka', 'parasport. udruga Rijeka'),
|
||||
('Parasportski savez Grada Rijeke', 'parasport', 'multisport', 'Rijeka', 'gradski parasport savez'),
|
||||
('Riječki sportski savez gluhih', 'parasport', 'sport gluhih', 'Rijeka', '5 klubova gluhih: Galeb x4 + DSR'),
|
||||
]
|
||||
|
||||
inserted = 0; updated = 0
|
||||
for naziv, sport, tag, grad, opis in PARASPORT_KLUBOVI:
|
||||
cu.execute("""SELECT id, savez_id FROM pgz_sport.klubovi
|
||||
WHERE LOWER(naziv) = LOWER(%s) LIMIT 1""", (naziv,))
|
||||
row = cu.fetchone()
|
||||
if row:
|
||||
kid, old_savez = row
|
||||
cu.execute("""UPDATE pgz_sport.klubovi SET
|
||||
savez_id = %s, sport = %s, region = COALESCE(region, 'PGŽ'),
|
||||
grad = COALESCE(grad, %s), napomena = COALESCE(napomena, %s)
|
||||
WHERE id = %s""", (PARASPORT_ID, sport, grad, opis, kid))
|
||||
updated += 1
|
||||
print(f" ↺ {naziv[:50]} (savez_id: {old_savez} → {PARASPORT_ID})")
|
||||
else:
|
||||
cu.execute("""INSERT INTO pgz_sport.klubovi
|
||||
(naziv, savez_id, sport, region, grad, napomena, aktivan)
|
||||
VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
|
||||
(naziv, PARASPORT_ID, sport, grad, opis))
|
||||
inserted += 1
|
||||
print(f" + {naziv[:50]}")
|
||||
|
||||
# Plus 5 gluhih klubova (sub-članovi RSS gluhih)
|
||||
GLUHI_KLUBOVI = [
|
||||
('Streljački klub gluhih "Galeb"', 'streljaštvo', 'Rijeka'),
|
||||
('Malonogometni klub gluhih "Galeb"', 'malonogomet', 'Rijeka'),
|
||||
('Kuglački klub gluhih "Galeb"', 'kuglanje', 'Rijeka'),
|
||||
('Stolnoteniski klub gluhih "Galeb"', 'stolni tenis', 'Rijeka'),
|
||||
('Društvo sportske rekreacije gluhih "Galeb"', 'rekreacija', 'Rijeka'),
|
||||
]
|
||||
for naziv, sport, grad in GLUHI_KLUBOVI:
|
||||
cu.execute("""SELECT id FROM pgz_sport.klubovi
|
||||
WHERE LOWER(naziv) = LOWER(%s) LIMIT 1""", (naziv,))
|
||||
if not cu.fetchone():
|
||||
cu.execute("""INSERT INTO pgz_sport.klubovi
|
||||
(naziv, savez_id, sport, region, grad, napomena, aktivan)
|
||||
VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
|
||||
(naziv, PARASPORT_ID, 'parasport-' + sport, grad, 'pridruženi član preko Riječkog SS gluhih'))
|
||||
inserted += 1
|
||||
|
||||
print(f"\nParasport klubovi: inserted={inserted}, updated={updated}")
|
||||
|
||||
# ============================================================
|
||||
# 5) FINAL counts
|
||||
# ============================================================
|
||||
cu.execute("SELECT count(*) FROM pgz_sport.klubovi WHERE savez_id = %s", (PARASPORT_ID,))
|
||||
print(f"\nUkupno parasportskih klubova: {cu.fetchone()[0]}")
|
||||
|
||||
cu.execute("""SELECT count(*) FROM pgz_sport.statistika_saveza
|
||||
WHERE godina = 2025 AND klubova_clanica IS NOT NULL""")
|
||||
print(f"Statistika saveza 2025 (s brojkama): {cu.fetchone()[0]}")
|
||||
|
||||
cu.execute("SELECT count(*) FROM pgz_sport.dokumenti WHERE vrsta = 'godisnjak'")
|
||||
print(f"Godišnjak dokumenti: {cu.fetchone()[0]}")
|
||||
|
||||
conn.close()
|
||||
print("\n✓ Done")
|
||||
Executable
+82
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Mine 18 godišnjaka 2006-2024: extract klub mentions, sportaš results, trophies."""
|
||||
import psycopg2, re, json
|
||||
from collections import defaultdict
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
# Get all godisnjak texts
|
||||
cu.execute("""SELECT id, godina, length(sadrzaj) AS chars, sadrzaj
|
||||
FROM pgz_sport.dokumenti
|
||||
WHERE vrsta='godisnjak' AND godina IS NOT NULL
|
||||
ORDER BY godina""")
|
||||
godisnjaci = cu.fetchall()
|
||||
print(f"Loaded {len(godisnjaci)} godišnjaka")
|
||||
|
||||
# Get all PGZ klubovi for matching
|
||||
cu.execute("""SELECT id, naziv FROM pgz_sport.klubovi WHERE aktivan=true""")
|
||||
klubovi = cu.fetchall()
|
||||
print(f"Active klubova: {len(klubovi)}")
|
||||
|
||||
# Build matching index - extract base name from naziv
|
||||
def base_name(naziv):
    """Reduce a club name to the short base used for text matching.

    Applied in order: drop a leading club-type abbreviation (e.g. "NK ",
    "ŽRK "), replace any parenthesised part with a space, then drop a leading
    "<sport adjective> klub" phrase. The result is stripped and cut to 50
    characters.
    """
    steps = (
        (r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK|ŽNK|ŠD|ŠRK|HRK|HŠŠ|KAK|KKM|KKP|HOO|VKK|HMNL|ŽRK|ŠKD|ŠK|ŠHRK)\s+',
         '', re.IGNORECASE),
        (r'\s*\([^)]+\)\s*', ' ', 0),
        (r'^(NOGOMETNI|RUKOMETNI|VATERPOLO|ATLETSKI|TENISKI|KOŠARKAŠKI|BOĆARSKI|JEDRILIČARSKI|KARATE)\s+(KLUB|KLUB\s+)',
         '', re.IGNORECASE),
    )
    result = naziv
    for pattern, replacement, flags in steps:
        # Strip after every step so the next "^"-anchored pattern sees the
        # text without leading whitespace.
        result = re.sub(pattern, replacement, result, flags=flags).strip()
    return result[:50]
|
||||
|
||||
# Index for fast lookup
|
||||
klub_index = [] # (klub_id, naziv, base, base_lower)
|
||||
for kid, naziv in klubovi:
|
||||
if not naziv or len(naziv) < 3: continue
|
||||
b = base_name(naziv)
|
||||
if len(b) < 3: continue
|
||||
klub_index.append((kid, naziv, b, b.lower()))
|
||||
|
||||
# Stats per klub: in which years did it appear?
|
||||
klub_mentions = defaultdict(list) # klub_id → [godina,...]
|
||||
|
||||
# For each godišnjak, find clubs mentioned
|
||||
for did, godina, chars, text in godisnjaci:
|
||||
if not text or len(text) < 5000: continue
|
||||
text_low = text.lower()
|
||||
matched_in_doc = set()
|
||||
for kid, naziv, base, base_low in klub_index:
|
||||
if base_low in text_low:
|
||||
matched_in_doc.add(kid)
|
||||
print(f" godišnjak {godina}: {len(matched_in_doc)} klubova mentioned")
|
||||
for kid in matched_in_doc:
|
||||
klub_mentions[kid].append(godina)
|
||||
|
||||
# Update klubovi with godina_prvog_pojavljivanja and godina_zadnjeg
|
||||
print(f"\n=== Klubovi sa mentions: {len(klub_mentions)} ===")
|
||||
|
||||
# Add new column
|
||||
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
|
||||
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
|
||||
cu.execute("ALTER TABLE pgz_sport.klubovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")
|
||||
|
||||
updated = 0
|
||||
for kid, godine in klub_mentions.items():
|
||||
godine_sorted = sorted(set(godine))
|
||||
cu.execute("""UPDATE pgz_sport.klubovi
|
||||
SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s
|
||||
WHERE id=%s""",
|
||||
(godine_sorted, godine_sorted[0], godine_sorted[-1], kid))
|
||||
updated += 1
|
||||
|
||||
print(f"Updated {updated} klubova sa godinama pojavljivanja")
|
||||
|
||||
# Top klubovi by mentions
|
||||
top_klubovi = sorted(klub_mentions.items(), key=lambda x: len(x[1]), reverse=True)[:20]
|
||||
print("\n=== TOP 20 klubova po godinama pojavljivanja ===")
|
||||
for kid, godine in top_klubovi:
|
||||
cu.execute("SELECT naziv FROM pgz_sport.klubovi WHERE id=%s", (kid,))
|
||||
n = cu.fetchone()[0]
|
||||
print(f" {len(godine):2}× {n[:60]}")
|
||||
|
||||
conn.close()
|
||||
Executable
+117
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
import psycopg2
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()

# Find the Parasport savez ID. The rest of the script is meaningless without
# it, so stop with a readable message instead of a TypeError on None[0].
cu.execute("SELECT id FROM pgz_sport.savezi WHERE naziv ILIKE 'Parasportski savez%PG%' LIMIT 1")
savez_row = cu.fetchone()
if savez_row is None:
    conn.close()
    raise SystemExit("Parasportski savez PGŽ not found in pgz_sport.savezi")
PARASPORT_ID = savez_row[0]
print(f"Parasport savez ID: {PARASPORT_ID}")

# Discover which optional contact columns this table actually has.
cu.execute("""SELECT column_name FROM information_schema.columns
              WHERE table_schema='pgz_sport' AND table_name='savezi'""")
cols = [r[0] for r in cu.fetchall()]
print(f"Savezi cols: {cols}")

# Use 'adresa' or whatever exists
addr_col = 'adresa' if 'adresa' in cols else ('sjediste' if 'sjediste' in cols else None)
web_col = 'web' if 'web' in cols else 'web_stranica'
email_col = 'email' if 'email' in cols else None

# Column names come from information_schema (not user input), so formatting
# them into the SQL is safe; the values are still passed as parameters.
# COALESCE keeps any value already stored and only fills in NULLs.
set_parts = []
vals = []
if addr_col:
    set_parts.append(f'{addr_col} = COALESCE({addr_col}, %s)')
    vals.append('Šetalište Ivana Gorana Kovačića 14, 51000 Rijeka')
if web_col in cols:
    set_parts.append(f'{web_col} = COALESCE({web_col}, %s)')
    vals.append('http://www.ssoi-pgz.hr')
if email_col and email_col in cols:
    set_parts.append(f'{email_col} = COALESCE({email_col}, %s)')
    vals.append('ssoi-pgz@ssoi-pgz.hr')

if set_parts:
    sql = f"UPDATE pgz_sport.savezi SET {', '.join(set_parts)} WHERE id = %s"
    vals.append(PARASPORT_ID)
    cu.execute(sql, vals)
    print(f"✓ Parasport kontakt updated: {addr_col}, {web_col}, {email_col}")
|
||||
|
||||
# Predsjednik + tajnik u osobe_funkcije
|
||||
def upsert_funkcioner(ime, prezime, funkcija, savez_id):
    """Insert an official into osobe_funkcije unless the name is already there.

    The existence check is by case-insensitive ime + prezime only; an existing
    row is left untouched (nothing is updated). Uses the module-level cursor
    ``cu``.

    Returns True when a row was inserted, False when the person already exists.
    """
    cu.execute("""SELECT id FROM pgz_sport.osobe_funkcije
                  WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s)""",
               (ime, prezime))
    if cu.fetchone() is not None:
        return False
    cu.execute("""INSERT INTO pgz_sport.osobe_funkcije
                  (ime, prezime, funkcija, savez_id) VALUES (%s,%s,%s,%s)""",
               (ime, prezime, funkcija, savez_id))
    return True


# President + secretary of the Parasport savez.
for _ime, _prezime, _funkcija in (
    ('Zvonimir', 'Brozić', 'predsjednik Parasportskog saveza PGŽ'),
    ('Luka', 'Dobrović', 'tajnik Parasportskog saveza PGŽ'),
):
    if upsert_funkcioner(_ime, _prezime, _funkcija, PARASPORT_ID):
        print(f"+ {_prezime}")
    else:
        print(f"⊘ {_prezime} već postoji")
|
||||
|
||||
# Insert 12+5 parasportskih klubova
|
||||
PARASPORT_KLUBOVI = [
|
||||
('Paraatletski klub "Srce" Rijeka', 'parasport-atletika', 'Rijeka', 'atletika invalidi'),
|
||||
('Paraplivački klub "Forca"', 'parasport-plivanje', 'Rijeka', 'plivanje invalidi'),
|
||||
('Parastolnoteniski klub Rijeka', 'parasport-stolni tenis','Rijeka', 'stolni tenis invalidi'),
|
||||
('Parastreljački klub "Paraolimpijac"', 'parasport-streljaštvo', 'Rijeka', 'streljaštvo invalidi'),
|
||||
('Sportski klub slijepih "Rijeka"', 'parasport-multisport', 'Rijeka', 'multisport za slijepe'),
|
||||
('Parasportski boccia klub "Rijeka"', 'parasport-bocce', 'Rijeka', 'boćanje invalidi'),
|
||||
('Klub dresurnog jahanja za osobe s invaliditetom "Pegaz"', 'parasport-jahanje', 'Rijeka', 'dresurno jahanje invalidi'),
|
||||
('Parasportska udruga za rekreaciju "Rijeka"','parasport-rekrejacija', 'Rijeka', 'rekreacija invalidi'),
|
||||
('KKOI Kostrena', 'parasport-multisport', 'Kostrena', 'KK osoba s invaliditetom Kostrena'),
|
||||
('PAK "Rijeka"', 'parasport-multisport', 'Rijeka', 'parasport. udruga Rijeka'),
|
||||
('Parasportski savez Grada Rijeke', 'parasport-multisport', 'Rijeka', 'gradski parasport savez'),
|
||||
('Riječki sportski savez gluhih', 'parasport-gluhi', 'Rijeka', '5 klubova: Galeb x4 + DSR'),
|
||||
# 5 gluhih klubova preko RSS gluhih
|
||||
('Streljački klub gluhih "Galeb"', 'parasport-gluhi-streljaštvo', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
|
||||
('Malonogometni klub gluhih "Galeb"', 'parasport-gluhi-malonogomet', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
|
||||
('Kuglački klub gluhih "Galeb"', 'parasport-gluhi-kuglanje', 'Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
|
||||
('Stolnoteniski klub gluhih "Galeb"', 'parasport-gluhi-stolni tenis','Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
|
||||
('Društvo sportske rekreacije gluhih "Galeb"','parasport-gluhi-rekrejacija','Rijeka', 'pridruženi član preko Riječkog SS gluhih'),
|
||||
]
|
||||
|
||||
# Sync every club from PARASPORT_KLUBOVI: existing rows (matched by
# case-insensitive name) are re-pointed at the Parasport savez and get their
# sport tag overwritten, while region/grad/napomena are only filled when NULL;
# unknown clubs are inserted as active.
counts = {"inserted": 0, "updated": 0}
for naziv, sport_tag, grad, opis in PARASPORT_KLUBOVI:
    cu.execute("SELECT id, savez_id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) LIMIT 1", (naziv,))
    existing = cu.fetchone()
    if existing is None:
        cu.execute("""INSERT INTO pgz_sport.klubovi
                      (naziv, savez_id, sport, region, grad, napomena, aktivan)
                      VALUES (%s, %s, %s, 'PGŽ', %s, %s, true)""",
                   (naziv, PARASPORT_ID, sport_tag, grad, opis))
        counts["inserted"] += 1
        continue
    klub_id = existing[0]
    cu.execute("""UPDATE pgz_sport.klubovi SET
                  savez_id = %s, sport = %s, region = COALESCE(region, 'PGŽ'),
                  grad = COALESCE(grad, %s), napomena = COALESCE(napomena, %s)
                  WHERE id = %s""", (PARASPORT_ID, sport_tag, grad, opis, klub_id))
    counts["updated"] += 1

inserted, updated = counts["inserted"], counts["updated"]
print(f"Parasport klubovi: inserted={inserted}, updated={updated}")

# Report: total and full listing of clubs now attached to the savez.
cu.execute("SELECT count(*) FROM pgz_sport.klubovi WHERE savez_id = %s", (PARASPORT_ID,))
(ukupno,) = cu.fetchone()
print(f"Ukupno parasport klubova: {ukupno}")

cu.execute("""SELECT k.naziv, k.sport, k.grad
              FROM pgz_sport.klubovi k
              WHERE k.savez_id = %s ORDER BY k.naziv""", (PARASPORT_ID,))
print("\nLista parasport klubova:")
for klub_naziv, klub_sport, klub_grad in cu.fetchall():
    print(f" • {klub_naziv} ({klub_sport}) — {klub_grad}")

conn.close()
print("\n✓ Done")
|
||||
Executable
+68
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Fast godišnjak mining: search each yearbook's lowercased text for known sportaši names ("ime prezime" and "prezime ime")."""
|
||||
import psycopg2, re
|
||||
from collections import defaultdict
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()


def _normalize(s):
    """Lowercase *s* and collapse every whitespace run to a single space.

    Extracted yearbook text can break a name across lines, so both the text and
    the names are normalized the same way before matching.
    """
    return re.sub(r'\s+', ' ', s.lower()).strip()


def _mentions_name(text, name):
    """Return True if *name* occurs in *text* as whole words.

    A plain ``name in text`` also fires inside longer names ("ana marić" inside
    "ivana marićević"); here a hit only counts when the characters directly
    before and after it are not alphanumeric.
    """
    start = text.find(name)
    while start != -1:
        end = start + len(name)
        before_ok = start == 0 or not text[start - 1].isalnum()
        after_ok = end == len(text) or not text[end].isalnum()
        if before_ok and after_ok:
            return True
        start = text.find(name, start + 1)
    return False


cu.execute("SELECT id, godina, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina IS NOT NULL ORDER BY godina")
godisnjaci = cu.fetchall()
print(f"Loaded {len(godisnjaci)} godišnjaka", flush=True)

# Build map: normalized "ime prezime" / "prezime ime" → {sportas_id, ...}
cu.execute("SELECT id, ime, prezime FROM pgz_sport.clanovi WHERE ime IS NOT NULL AND prezime IS NOT NULL")
sportasi = cu.fetchall()
name_to_ids = defaultdict(set)
for sid, ime, prezime in sportasi:
    if not ime or not prezime: continue
    full = _normalize(f"{ime} {prezime}")
    full2 = _normalize(f"{prezime} {ime}")
    # Very short names produce too many accidental hits; skip them.
    if len(full) >= 8:
        name_to_ids[full].add(sid)
        name_to_ids[full2].add(sid)
print(f"Indexed {len(name_to_ids)} name variants for {len(sportasi)} sportaša", flush=True)

# Process each godišnjak: sportas_id → set of years in which the name appears
mentions = defaultdict(set)
for did, godina, text in godisnjaci:
    # Tiny documents are skipped (threshold: 5000 characters).
    if not text or len(text) < 5000: continue
    text_low = _normalize(text)
    found_names = 0
    for name, sids in name_to_ids.items():
        if _mentions_name(text_low, name):
            for sid in sids:
                mentions[sid].add(godina)
            found_names += 1
    print(f" godišnjak {godina}: {found_names} matches", flush=True)

print(f"\nTotal sportaša mentioned: {len(mentions)}", flush=True)

# Update DB
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_godine INT[]")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_prvi INT")
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS godisnjak_zadnji INT")

updated = 0
for sid, godine in mentions.items():
    g = sorted(godine)
    cu.execute("UPDATE pgz_sport.clanovi SET godisnjak_godine=%s, godisnjak_prvi=%s, godisnjak_zadnji=%s WHERE id=%s",
               (g, g[0], g[-1], sid))
    updated += 1

print(f"\nUpdated {updated} sportaša", flush=True)

# Top mentioned
top = sorted(mentions.items(), key=lambda x: len(x[1]), reverse=True)[:25]
print("\nTOP 25 sportaša po godinama:")
for sid, godine in top:
    cu.execute("SELECT ime, prezime, sport, kategorija_hoo FROM pgz_sport.clanovi WHERE id=%s", (sid,))
    r = cu.fetchone()
    if r:
        kh = f" KAT-{r[3]}" if r[3] else ""
        print(f" {len(godine):2}× {r[0]} {r[1]:<28} ({r[2] or '?'}{kh})")

conn.close()
|
||||
@@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python3
|
||||
# gov_hr_sport_scraper.py — Ministarstvo turizma i sporta
|
||||
import os, time, hashlib, logging, re, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from html import unescape
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [gov_sport] %(message)s')
|
||||
log = logging.getLogger("gov_sport")
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
ROOTS = [
|
||||
"https://mint.gov.hr",
|
||||
"https://mint.gov.hr/sport-i-rekreacija/87",
|
||||
"https://mint.gov.hr/sport-i-rekreacija/javne-potrebe-u-sportu",
|
||||
"https://sport.gov.hr",
|
||||
"https://hoo.hr",
|
||||
]
|
||||
|
||||
def fetch(url):
    """Download *url* with the bot User-Agent.

    Returns ``(body, status)`` where body is the response decoded as UTF-8
    (undecodable bytes replaced). Any failure is logged as a warning and
    reported as ``(None, 0)``; nothing is raised.
    """
    result = (None, 0)
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as resp:
            body = resp.read().decode('utf-8', errors='replace')
            result = (body, resp.status)
    except Exception as e:
        log.warning(f"Fail {url}: {e}")
        result = (None, 0)
    return result
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML document to plain text.

    Drops <script> and <style> elements with their content, replaces every
    remaining tag with a space, decodes HTML entities, collapses whitespace
    and finally removes NUL characters. Falsy input yields "".
    """
    if not html:
        return ""
    stripped = html
    for element_pattern in (r'<script[^>]*>.*?</script>', r'<style[^>]*>.*?</style>'):
        stripped = re.sub(element_pattern, '', stripped, flags=re.S|re.I)
    without_tags = re.sub(r'<[^>]+>', ' ', stripped)
    collapsed = re.sub(r'\s+', ' ', unescape(without_tags))
    # NULs are removed last, after trimming, exactly in this order.
    return collapsed.strip().replace('\x00', '')
|
||||
|
||||
def find_links(html, base):
    """Return the in-scope absolute links found in *html*.

    Every ``href`` is resolved against *base*, its ``#fragment`` is dropped and
    it is kept only when it is http(s) and its host is one of the crawled
    domains or a subdomain of one. The previous substring test also accepted
    hosts such as ``evilhoo.hr`` or ``hoo.hr.example.com``.

    Links are returned de-duplicated in document order, so a caller that
    truncates the list gets a deterministic selection. Falsy *html* yields [].
    """
    if not html:
        return []
    allowed = ('mint.gov.hr', 'sport.gov.hr', 'hoo.hr')
    links = {}  # dict used as an ordered set
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            u = urljoin(base, m.group(1)).split('#', 1)[0]
            parsed = urlparse(u)
            host = (parsed.hostname or "").lower()
        except ValueError:
            # Malformed href (e.g. broken IPv6 literal) — ignore this link.
            continue
        if parsed.scheme not in ('http', 'https'):
            continue
        if any(host == d or host.endswith('.' + d) for d in allowed):
            links.setdefault(u, None)
    return list(links)
|
||||
|
||||
def harvest():
    """Breadth-first crawl of ROOTS, storing pages and knowledge chunks.

    Visits at most 150 URLs with a 2 s pause before each request. Each page
    with at least 200 characters of text is inserted into pgz_sport.dokumenti;
    sport-related pages additionally contribute up to four 800-character
    chunks to dabi.knowledge. Up to 25 in-scope links per page are queued.

    A failed insert is logged and skipped (best effort), and the connection
    is always closed, even when the crawl aborts.
    """
    from collections import deque

    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    visited = set()
    queue = deque(ROOTS)
    enqueued = set(ROOTS)  # everything ever queued: O(1) duplicate check
    docs = facts = 0
    keywords = ['sport', 'klub', 'savez', 'sportaš', 'natjecanj', 'olimpij', 'paraolimp']
    try:
        while queue and len(visited) < 150:
            url = queue.popleft()
            if url in visited: continue
            visited.add(url)
            time.sleep(2)
            html, status = fetch(url)
            if not html or status != 200: continue
            log.info(f"[{status}] {url[:80]}")
            text = extract_text(html)
            if len(text) < 200: continue
            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
            title = title_m.group(1).strip() if title_m else url[:80]
            sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
            try:
                cur.execute("""INSERT INTO pgz_sport.dokumenti
                    (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                    VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
                    (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'MTIS/HOO'))
                docs += cur.rowcount
            except psycopg2.Error as e:
                log.warning(f"dokumenti insert failed for {url}: {e}")
            # Knowledge extract — sport-relevant
            text_low = text.lower()
            if any(kw in text_low for kw in keywords):
                chunks = [text[i:i+800] for i in range(0, min(len(text), 4000), 800)]
                for ci, chunk in enumerate(chunks[:4]):
                    if len(chunk) < 200: continue
                    fact_hash = hashlib.sha256((url+str(ci)+chunk[:100]).encode()).hexdigest()[:32]
                    try:
                        cur.execute("""INSERT INTO dabi.knowledge
                            (fact, category, source, source_refs, confidence, data_hash, created_at)
                            VALUES (%s,'gov_hr_sport','gov_hr_sport_scraper',%s::jsonb,0.85,%s,now())
                            ON CONFLICT (data_hash) DO NOTHING""",
                            (chunk[:1500], json.dumps([{"url":url}]), fact_hash))
                        facts += cur.rowcount
                    except psycopg2.Error as e:
                        log.warning(f"knowledge insert failed for {url} chunk {ci}: {e}")
            for link in find_links(html, url)[:25]:
                if link not in enqueued:
                    enqueued.add(link)
                    queue.append(link)
        log.info(f"FINAL: visited={len(visited)} docs={docs} facts={facts}")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    harvest()
|
||||
Executable
+209
@@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Hrvatski boćarski savez (HBS) scraper.
|
||||
|
||||
Strategy:
|
||||
- For each PGŽ-region boćarski klub, try slug from naziv → fetch /klubovi/{slug}/
|
||||
- Parse "Popis igrača" section using regex: "N. E-XXXX, Ime Prezime, GGGG."
|
||||
- Upsert into clanovi with source='hbs_savez', source_id=<reg_broj>
|
||||
|
||||
Modes:
|
||||
python hbs_bocar.py probe <slug> — fetch single klub
|
||||
python hbs_bocar.py klub <db_klub_id> — scrape one klub by DB id
|
||||
python hbs_bocar.py all — sweep all PGŽ-region boćarski klubovi
|
||||
"""
|
||||
import os, re, sys, time, logging
|
||||
from datetime import datetime, date
|
||||
import psycopg2, psycopg2.extras
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
BASE = "https://hrvatski-bocarski-savez.hr"
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
|
||||
RATE_S = 1.0
|
||||
TIMEOUT = 25
|
||||
|
||||
log = logging.getLogger("hbs")
|
||||
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO,
|
||||
handlers=[logging.FileHandler('/opt/pgz-sport/_logs/hbs_scraper.log'), logging.StreamHandler(sys.stdout)])
|
||||
|
||||
# Open a new psycopg2 connection using the module-level DB settings.
# Each call creates a fresh connection; the caller is responsible for closing it.
# NOTE(review): DB above carries a hard-coded password — consider loading it
# from the environment or a secrets file instead of source control.
def conn(): return psycopg2.connect(**DB)
|
||||
|
||||
def fetch(url):
    """GET *url* and return the response body as text.

    Raises requests.HTTPError for non-2xx responses (callers rely on this to
    detect 404s) and other requests exceptions on network failure.

    The RATE_S pause now runs after every attempt, not only after a success:
    slug probing produces mostly 404s, and those used to skip the delay.
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        time.sleep(RATE_S)
|
||||
|
||||
def slugify(s):
    """Turn a name into a lowercase ASCII URL slug.

    Lowercases and trims, maps the Croatian letters č/ć/š/ž/đ to c/c/s/z/d,
    then replaces every run of other non ``a-z0-9`` characters with a single
    hyphen and strips hyphens from both ends.
    """
    croatian_to_ascii = str.maketrans({'č': 'c', 'ć': 'c', 'š': 's', 'ž': 'z', 'đ': 'd'})
    lowered = s.lower().strip().translate(croatian_to_ascii)
    hyphenated = re.sub(r'[^a-z0-9]+', '-', lowered)
    return hyphenated.strip('-')
|
||||
|
||||
def naziv_to_slug_candidates(naziv):
    """Return de-duplicated slug guesses for a club name, most specific first.

    Candidates, in order:
      1. the name with one leading club-type prefix removed,
      2. the full name,
      3. the name with the words 'boćarski' and 'klub' removed.
    Example: 'BK Halubjan' → ['halubjan', 'bk-halubjan'].

    A prefix only counts when it ends at a word boundary, so 'BKR Rijeka' is
    no longer cut down to 'r-rijeka' by the 'bk' prefix.
    """
    n = naziv.lower().strip()
    candidates = []
    # Strip prefix words
    for prefix in ('boćarski klub', 'bocarski klub', 'b.k.', 'bk', 'b k', 'klub', 'društvo'):
        if not n.startswith(prefix):
            continue
        rest = n[len(prefix):]
        # A prefix ending in a letter must not be glued to another letter/digit.
        if prefix[-1].isalnum() and rest[:1].isalnum():
            continue
        candidates.append(slugify(rest.strip()))
        break
    candidates.append(slugify(n))
    candidates.append(slugify(n.replace('boćarski','').replace('klub','').strip()))
    # dict.fromkeys keeps first-seen order while dropping duplicates and empties.
    return list(dict.fromkeys(c for c in candidates if c))
|
||||
|
||||
def parse_klub_page(html, klub_url=None):
    """Parse a boćarski klub page into club metadata and a player list.

    Returns ``{"klub_url": ..., "players": [...], "meta": {...}}``. ``meta`` may
    hold 'naziv' (first <h1>), 'zupanija', 'liga', 'oib' and 'osnovan' when
    found. Each player is a dict with 'reg_broj', 'ime', 'prezime' and
    'godina_rodenja'.
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"klub_url": klub_url, "players": [], "meta": {}}

    # Title — club name
    h1 = soup.find('h1')
    if h1: out['meta']['naziv'] = h1.get_text(' ', strip=True)

    body = (soup.find(class_='entry-content') or soup.find('main') or soup.body or soup)
    text = body.get_text(' ', strip=True)

    # Extract club meta.
    # The value class used to be [^A-Z]+?, which cannot match any value that
    # contains a capital letter (e.g. "Primorsko-goranska", "III HBL"), so
    # these two fields never matched such values. Capture lazily up to the
    # next known label instead, capped at 80 characters so a missing label
    # cannot swallow the rest of the page.
    # NOTE(review): the label order (Liga / Adresa / Sportske) is taken from the
    # original lookaheads — confirm against a live page.
    m_zup = re.search(r'Županija:\s*(.{1,80}?)\s*(?=Liga|Adresa|$)', text, re.S)
    if m_zup: out['meta']['zupanija'] = m_zup.group(1).strip()
    m_lig = re.search(r'Liga:\s*(.{1,80}?)\s*(?=Adresa|Sportske|$)', text, re.S)
    if m_lig: out['meta']['liga'] = m_lig.group(1).strip()
    m_oib = re.search(r'OIB:\s*(\d{11})', text)
    if m_oib: out['meta']['oib'] = m_oib.group(1)
    m_god = re.search(r'osnivanja:\s*(\d{4})', text)
    if m_god: out['meta']['osnovan'] = int(m_god.group(1))

    # Players — pattern: "N. E-XXXX, Ime Prezime, GGGG."
    # Variants: E-2755-11, E-02010, E-1317-04, etc.
    PLAYER_RE = re.compile(r'(\d+)\.\s+(E-[\dA-Z\-]+),\s+([^,]+?),\s+(\d{4})\.?', re.UNICODE)
    for m in PLAYER_RE.finditer(text):
        rb = m.group(2).strip()
        ime_full = m.group(3).strip()
        god = int(m.group(4))
        # The last word is taken as the surname; a single-word name is stored
        # entirely as 'ime' with an empty 'prezime'.
        parts = ime_full.rsplit(' ', 1)
        ime = parts[0] if len(parts) > 1 else ime_full
        prezime = parts[1] if len(parts) > 1 else ''
        out['players'].append({
            'reg_broj': rb,
            'ime': ime,
            'prezime': prezime,
            'godina_rodenja': god,
        })

    return out
|
||||
|
||||
def cmd_klub(klub_id_db):
    """Scrape one club's HBS page and upsert its players.

    Looks the club up by DB id, probes up to five slug candidates until a page
    with players is found, then inserts or updates each player in
    pgz_sport.clanovi keyed by (source='hbs_savez', source_id=reg_broj). When
    the page exposes an OIB, empty oib / web_stranica on the club are filled.

    Returns the number of players processed (0 when the club or page is not
    found).

    psycopg2's ``with connection`` only ends the transaction; it does not close
    the connection. Both connections are therefore wrapped in ``closing`` so a
    full sweep no longer leaks two connections per club.
    """
    from contextlib import closing

    with closing(conn()) as c:
        with c:
            cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            cu.execute("SELECT id, naziv FROM pgz_sport.klubovi WHERE id=%s", (klub_id_db,))
            klub = cu.fetchone()
    if not klub:
        log.error(f"Klub #{klub_id_db} not found")
        return 0

    candidates = naziv_to_slug_candidates(klub['naziv'])
    log.info(f"Klub: {klub['naziv']} candidates={candidates[:5]}")

    parsed = None
    used_slug = None
    for slug in candidates[:5]:
        url = f"{BASE}/klubovi/{slug}/"
        try:
            html = fetch(url)
            p = parse_klub_page(html, url)
        except requests.HTTPError as e:
            # 404 just means "wrong slug guess"; anything else is worth a warning.
            if e.response is None or e.response.status_code != 404:
                log.warning(f" {slug}: {e}")
            continue
        except Exception as e:
            log.warning(f" {slug}: {e}")
            continue
        if p.get('players'):
            parsed = p; used_slug = slug
            break
        # A page without players is not accepted — keep trying other slugs.

    if not parsed:
        log.warning(f" → no match for {klub['naziv']} (tried {candidates[:5]})")
        return 0

    # Upsert players
    klub_url = f"{BASE}/klubovi/{used_slug}/"
    n = 0
    with closing(conn()) as c:
        with c:  # commits on success, rolls back on error
            cu = c.cursor()
            for pl in parsed['players']:
                # source_id = reg_broj (HBS unique)
                cu.execute("""SELECT id FROM pgz_sport.clanovi
                              WHERE source='hbs_savez' AND source_id=%s""", (pl['reg_broj'],))
                row = cu.fetchone()
                player_slug = slugify(pl['ime'] + ' ' + pl['prezime'])
                datum_aprox = f"{pl['godina_rodenja']}-01-01"  # only year known
                if row:
                    cu.execute("""UPDATE pgz_sport.clanovi
                                  SET ime=%s, prezime=%s, klub_id=%s, source_url=%s, source_synced_at=now()
                                  WHERE id=%s""", (pl['ime'], pl['prezime'], klub_id_db, klub_url, row[0]))
                else:
                    cu.execute("""INSERT INTO pgz_sport.clanovi
                                  (klub_id, ime, prezime, datum_rodenja, source, source_id, source_url,
                                   source_synced_at, slug, biografija)
                                  VALUES (%s,%s,%s,%s,'hbs_savez',%s,%s,now(),%s,%s)""",
                               (klub_id_db, pl['ime'], pl['prezime'], datum_aprox,
                                pl['reg_broj'], klub_url, player_slug,
                                f"Reg. broj HBS: {pl['reg_broj']} · Godina rođenja: {pl['godina_rodenja']}"))
                n += 1
            # Fill club OIB / web page only where currently empty
            if parsed['meta'].get('oib'):
                cu.execute("""UPDATE pgz_sport.klubovi
                              SET oib=COALESCE(NULLIF(oib,''),%s),
                                  web_stranica=COALESCE(NULLIF(web_stranica,''), %s),
                                  source_synced_at=now()
                              WHERE id=%s""",
                           (parsed['meta']['oib'], klub_url, klub_id_db))
    log.info(f" → {n} igrača za {klub['naziv']} (slug={used_slug})")
    return n
|
||||
|
||||
def cmd_all():
    """Run cmd_klub for every active club whose sport is 'boćanje'.

    A failure on one club is logged with its traceback and the sweep
    continues. The listing connection is closed explicitly because psycopg2's
    ``with connection`` does not close it.
    """
    from contextlib import closing

    with closing(conn()) as c:
        with c:
            cu = c.cursor()
            cu.execute("""SELECT id FROM pgz_sport.klubovi
                          WHERE sport='boćanje' AND aktivan=true
                          ORDER BY id""")
            kids = [r[0] for r in cu.fetchall()]
    log.info(f"Sweeping {len(kids)} boćarski klubovi (PGŽ)")

    total = 0; found_clubs = 0
    for kid in kids:
        try:
            n = cmd_klub(kid)
        except Exception as e:
            # log.exception keeps the traceback, which log.error dropped.
            log.exception(f"klub {kid}: {e}")
            continue
        total += n
        if n > 0: found_clubs += 1
    log.info(f"DONE: {total} igrača iz {found_clubs}/{len(kids)} klubova")
|
||||
|
||||
if __name__ == '__main__':
    # CLI dispatch: probe <slug> | klub <db_klub_id> | all
    if len(sys.argv) < 2: print(__doc__); sys.exit(1)
    cmd = sys.argv[1]
    # 'probe' and 'klub' need one argument; show usage instead of an IndexError.
    if cmd in ('probe', 'klub') and len(sys.argv) < 3:
        print(__doc__); sys.exit(1)
    if cmd == 'probe':
        html = fetch(f"{BASE}/klubovi/{sys.argv[2]}/")
        out = parse_klub_page(html)
        import json
        print(json.dumps(out, ensure_ascii=False, indent=2))
    elif cmd == 'klub':
        try:
            klub_id = int(sys.argv[2])
        except ValueError:
            print(f"klub id must be an integer: {sys.argv[2]}"); sys.exit(2)
        cmd_klub(klub_id)
    elif cmd == 'all':
        cmd_all()
    else:
        print(f"unknown: {cmd}"); sys.exit(2)
|
||||
Executable
+5
@@ -0,0 +1,5 @@
|
||||
#!/bin/bash
# Daily HBS scrape + league (lige) refresh.
# Abort if the scrapers directory is missing, otherwise python3 would run
# against whatever directory cron started in.
cd /opt/pgz-sport/scrapers || exit 1
# NOTE(review): hbs_scraper.py is called without a mode argument; if this is
# the probe/klub/all scraper it will only print usage — confirm.
python3 hbs_scraper.py >> /opt/pgz-sport/_logs/hbs_cron.log 2>&1
# NOTE(review): /tmp is world-writable and may be cleared on reboot; the liga
# scraper should presumably live next to the other scrapers — confirm path.
python3 /tmp/b3.py >> /opt/pgz-sport/_logs/hbs_lige_cron.log 2>&1
|
||||
Executable
+161
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
|
||||
import psycopg2, requests, re, html as h, time
|
||||
from datetime import datetime
|
||||
|
||||
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
|
||||
user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
sess = requests.Session()
|
||||
sess.headers.update({"User-Agent": UA})
|
||||
|
||||
LIGE = [
|
||||
# (url, naziv, sezona, razina)
|
||||
("https://hrvatski-bocarski-savez.hr/lige/i-hbl/?sezona=2025-2026", "I HBL", "2025/2026", "1.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-sjever/?sezona=2025-2026", "II HBL sjever", "2025/2026", "2.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-jug/?sezona=2025-2026", "II HBL jug", "2025/2026", "2.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-istra-primorje/?sezona=2025-2026", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-zagreb-slavonija/?sezona=2025-2026", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-srednja-dalmacija/?sezona=2025-2026", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-dubrovnik-neretva/?sezona=2025-2026", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-zenska-bocarska-liga/?sezona=2025-2026", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/juniorska-liga/?sezona=2025-2026", "Juniorska liga", "2025/2026", "Juniorska"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/kadetska-liga/?sezona=2025-2026", "Kadetska liga", "2025/2026", "Kadetska"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-raffa-liga/?sezona=2026", "Hrvatska Raffa liga", "2026", "HRL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-petanque-liga/?sezona=2026", "Hrvatska Petanque liga", "2026", "HPL"),
|
||||
]
|
||||
|
||||
# PGŽ klubovi keywords - to mark relevance
|
||||
# Lowercase substrings; a league is flagged as relevant when any club name
# (lowercased) contains one of them. "čavle" is added because the unaccented
# "cavle" can never match a lowercased "Čavle"; "cavle" is kept for sources
# that drop diacritics.
# NOTE(review): "pula", "istra" and "poreč" look like Istrian rather than PGŽ
# places — confirm they are intended here.
PGZ_KEYWORDS = ["pula","istra","poreč","rijeka","kastav","krimeja","podhum","kukuljanovo","zagon",
                "gornji kraj","ladvić","sveti rok","klana","sveti jakov","jadranovo","krk","cres",
                "lošinj","opatija","lovran","brod-moravice","brod moravice","skrad","mošćenice",
                "mrkopalj","fužine","hreljin","bakar","kostrena","cavle","čavle","drenova","srdoči",
                "vargon","sušak","novi vinodolski","crikvenica","selce","grižane","baška",
                "punat","omišalj","malinska","vrbnik","mali lošinj","ščelo","mrkopalj","lokve",
                "delnice","šparta","mošćenička draga","kraljevica","trsat","sušak","plase","matulji"]
|
||||
|
||||
def fetch(url):
    """GET *url* through the shared session; return the body or None.

    None is returned for any non-200 status and for request failures. Only
    requests' own exceptions are swallowed — the former bare ``except`` also
    hid KeyboardInterrupt and programming errors.
    """
    try:
        r = sess.get(url, timeout=20)
    except requests.RequestException:
        return None
    return r.text if r.status_code == 200 else None
|
||||
|
||||
def parse_table(html_text):
    """Parse an HBS league standings table out of page HTML.

    The first <tr> on the page must be a header containing "oredak" or "Poz";
    otherwise [] is returned. Every later row with at least five cells and an
    integer first cell becomes a dict with keys poz, klub, odigrano, pobjede,
    nerij, porazi, gz, gp, razl, bod. Missing or non-numeric stat cells
    default to 0; rows that cannot be parsed are skipped.

    Cell text is HTML-unescaped, so club names no longer carry raw entities
    such as ``&amp;`` and ``&minus;`` in the goal difference is understood.
    """
    from html import unescape

    def _cells(row_html):
        # Inner text of each <td>/<th>: tags removed, entities decoded, trimmed.
        raw = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
        return [unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in raw]

    def _num(cells, idx):
        # Non-negative integer at cells[idx], or 0 when absent / not digits.
        return int(cells[idx]) if len(cells) > idx and cells[idx].isdigit() else 0

    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text or '', re.DOTALL)
    if not rows: return []
    # First row is header. Required: Poredak / Poz column.
    headers = _cells(rows[0])
    if not any("oredak" in head or "Poz" in head for head in headers): return []

    out = []
    for row in rows[1:]:
        cells_clean = _cells(row)
        if len(cells_clean) < 5: continue
        try:
            poz = int(cells_clean[0])
            naziv = cells_clean[1]
            # Goal difference may be written as "+12", "−12" (U+2212) or "1,234".
            razl_str = (cells_clean[8].replace('+','').replace('−','-').replace('\u2212','-').replace(',','')
                        if len(cells_clean)>8 else "0")
            try: razl = int(razl_str)
            except ValueError: razl = 0
            bod = int(cells_clean[9]) if len(cells_clean)>9 and cells_clean[9].lstrip('-').isdigit() else 0
            out.append({
                "poz": poz, "klub": naziv, "odigrano": _num(cells_clean, 2), "pobjede": _num(cells_clean, 3),
                "nerij": _num(cells_clean, 4), "porazi": _num(cells_clean, 5), "gz": _num(cells_clean, 6),
                "gp": _num(cells_clean, 7), "razl": razl, "bod": bod
            })
        except (ValueError, IndexError):
            continue
    return out
|
||||
|
||||
def find_klub_id(cur, naziv):
    """Find a klub id in pgz_sport.klubovi for a scraped club name.

    Tries, in order: case-insensitive exact match, "<prefix> <naziv>" contained
    in the stored name for common club prefixes, then a plain "contains"
    match. Row id 4426 is always excluded.
    NOTE(review): 4426 is presumably a placeholder/duplicate row — confirm.

    Returns the first matching id, or None. An empty name returns None without
    querying (the pattern "%%" would otherwise match an arbitrary club), and
    LIKE wildcards inside the name are escaped so they match literally.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        return None
    # Try exact
    cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1", (naziv,))
    r = cur.fetchone()
    if r: return r[0]
    # Backslash is PostgreSQL's default LIKE escape character.
    like = naziv.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    # Prefixed variants first, bare "contains" as the last resort.
    patterns = [f"%{prefix} {like}%" for prefix in ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]]
    patterns.append(f"%{like}%")
    for pattern in patterns:
        cur.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1",
                    (pattern,))
        r = cur.fetchone()
        if r: return r[0]
    return None
|
||||
|
||||
# === MAIN ===
|
||||
# Each league is written in its own transaction: the natjecanje upsert, the
# DELETE of old standings and the new rows commit together or not at all.
# With autocommit a failure after the DELETE left the league table empty/partial.
conn = psycopg2.connect(**DB); conn.autocommit = False
cr = conn.cursor()
try:
    # Find boćanje savez_id
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    savez_row = cr.fetchone()
    savez_id = savez_row[0] if savez_row else None
    conn.commit()
    print(f"savez_id (HBS): {savez_id}")

    total_natj = 0; total_redova = 0; total_pgz_klub = 0
    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body: print(" fetch failed"); continue

        rows = parse_table(body)
        if not rows: print(" no rows parsed"); continue

        # Check PGZ relevance
        is_pgz = any(any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS) for row in rows)

        # e.g. ".../lige/i-hbl/?sezona=2025-2026" + "2025/2026" → "i-hbl_2025_2026"
        external_id = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/","_") + "_" + sezona.replace("/","_")
        matched = 0
        try:
            with conn:  # commit on success, rollback on any exception
                cr.execute("""
                    INSERT INTO pgz_sport.natjecanja
                    (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
                    VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
                    ON CONFLICT (source, external_id) DO UPDATE SET
                    updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
                    RETURNING id
                """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
                natj_id = cr.fetchone()[0]

                # Clear old data for this natjecanje + insert new
                cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))
                for row in rows:
                    kid = find_klub_id(cr, row["klub"])
                    if kid: matched += 1
                    cr.execute("""
                        INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                         gol_z, gol_p, gol_razlika, bodovi)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                          row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
        except psycopg2.Error as exc:
            print(f" DB error, liga rolled back: {exc}")
            continue

        # Count only what was actually committed.
        total_natj += 1
        total_redova += len(rows)
        total_pgz_klub += matched
        print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

        time.sleep(0.6)

    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")

    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    conn.close()
|
||||
+161
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""B.3 — Full HBS liga scraper into natjecanja + natjecanja_tablice."""
|
||||
import psycopg2, requests, re, html as h, time
|
||||
from datetime import datetime
|
||||
|
||||
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
|
||||
user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
sess = requests.Session()
|
||||
sess.headers.update({"User-Agent": UA})
|
||||
|
||||
LIGE = [
|
||||
# (url, naziv, sezona, razina)
|
||||
("https://hrvatski-bocarski-savez.hr/lige/i-hbl/?sezona=2025-2026", "I HBL", "2025/2026", "1.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-sjever/?sezona=2025-2026", "II HBL sjever", "2025/2026", "2.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/ii-hbl-jug/?sezona=2025-2026", "II HBL jug", "2025/2026", "2.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-istra-primorje/?sezona=2025-2026", "III HBL Istra-Primorje", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-zagreb-slavonija/?sezona=2025-2026", "III HBL Zagreb-Slavonija", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-srednja-dalmacija/?sezona=2025-2026", "III HBL srednja Dalmacija", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/iii-hbl-dubrovnik-neretva/?sezona=2025-2026", "III HBL Dubrovnik-Neretva", "2025/2026", "3.HBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-zenska-bocarska-liga/?sezona=2025-2026", "Hrvatska ženska boćarska liga", "2025/2026", "HŽBL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/juniorska-liga/?sezona=2025-2026", "Juniorska liga", "2025/2026", "Juniorska"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/kadetska-liga/?sezona=2025-2026", "Kadetska liga", "2025/2026", "Kadetska"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-raffa-liga/?sezona=2026", "Hrvatska Raffa liga", "2026", "HRL"),
|
||||
("https://hrvatski-bocarski-savez.hr/lige/hrvatska-petanque-liga/?sezona=2026", "Hrvatska Petanque liga", "2026", "HPL"),
|
||||
]
|
||||
|
||||
# PGŽ klubovi keywords - to mark relevance
|
||||
# Lowercase substrings; a league is flagged as relevant when any club name
# (lowercased) contains one of them. "čavle" is added because the unaccented
# "cavle" can never match a lowercased "Čavle"; "cavle" is kept for sources
# that drop diacritics.
# NOTE(review): "pula", "istra" and "poreč" look like Istrian rather than PGŽ
# places — confirm they are intended here.
PGZ_KEYWORDS = ["pula","istra","poreč","rijeka","kastav","krimeja","podhum","kukuljanovo","zagon",
                "gornji kraj","ladvić","sveti rok","klana","sveti jakov","jadranovo","krk","cres",
                "lošinj","opatija","lovran","brod-moravice","brod moravice","skrad","mošćenice",
                "mrkopalj","fužine","hreljin","bakar","kostrena","cavle","čavle","drenova","srdoči",
                "vargon","sušak","novi vinodolski","crikvenica","selce","grižane","baška",
                "punat","omišalj","malinska","vrbnik","mali lošinj","ščelo","mrkopalj","lokve",
                "delnice","šparta","mošćenička draga","kraljevica","trsat","sušak","plase","matulji"]
|
||||
|
||||
def fetch(url):
    """Fetch ``url`` and return the response text, or ``None`` on failure.

    The request goes through the module-level ``sess`` object (defined outside
    this block; presumably a ``requests.Session`` - confirm). ``None`` is
    returned for any non-200 status and for any error raised by the request.
    """
    try:
        resp = sess.get(url, timeout=20)
        return resp.text if resp.status_code == 200 else None
    except Exception:
        # Best-effort scrape: a failed page is skipped by the caller.
        # ``Exception`` (not a bare except) lets Ctrl-C / SystemExit through.
        return None
|
||||
|
||||
def parse_table(html_text):
    """Parse an HBS league standings table out of an HTML page.

    The first ``<tr>`` in the document is treated as the header and must
    contain a cell with "oredak" or "Poz"; otherwise nothing is parsed.
    Every following ``<tr>`` with at least five cells and an integer in the
    first cell becomes one dict with the keys ``poz``, ``klub``, ``odigrano``,
    ``pobjede``, ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and ``bod``.
    Missing or non-numeric statistic cells are stored as 0.

    Returns a list of such dicts (empty for empty/``None`` input or when no
    matching table is found).
    """
    from html import unescape  # local import: the file's import block is out of view

    if not html_text:
        return []
    rows = re.findall(r'<tr[^>]*>(.*?)</tr>', html_text, re.DOTALL)
    if not rows:
        return []

    def cell_texts(row_html):
        """Return the tag-free, entity-decoded, stripped text of each cell."""
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row_html, re.DOTALL)
        return [unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]

    def number(cells, idx, signed=False):
        """Return cell ``idx`` as int, or 0 when absent or not numeric."""
        if idx >= len(cells):
            return 0
        raw = cells[idx]
        digits = raw.lstrip('-') if signed else raw
        return int(raw) if digits.isdigit() else 0

    # Required header: position column ("Poredak" / "Poz...").
    headers = cell_texts(rows[0])
    if not any("oredak" in h or "Poz" in h for h in headers):
        return []

    parsed = []
    for row in rows[1:]:
        cells = cell_texts(row)
        if len(cells) < 5:
            continue
        try:
            # Positions are often printed as "1." - accept the trailing dot.
            poz = int(cells[0].rstrip('.'))
            razl_raw = "0"
            if len(cells) > 8:
                # Normalise "+12", the typographic minus and thousands commas.
                razl_raw = cells[8].replace('+', '').replace('−', '-').replace(',', '')
            try:
                razl = int(razl_raw)
            except ValueError:
                razl = 0
            parsed.append({
                "poz": poz, "klub": cells[1],
                "odigrano": number(cells, 2), "pobjede": number(cells, 3),
                "nerij": number(cells, 4), "porazi": number(cells, 5),
                "gz": number(cells, 6), "gp": number(cells, 7),
                "razl": razl, "bod": number(cells, 9, signed=True),
            })
        except (ValueError, IndexError):
            # Not a data row (e.g. a sub-header) - skip it.
            continue
    return parsed
|
||||
|
||||
def find_klub_id(cur, naziv):
    """Find the ``pgz_sport.klubovi.id`` that best matches a scraped club name.

    Lookups are tried in order and the first hit wins:
      1. case-insensitive exact name,
      2. name preceded by a common club prefix ("BK", "MK", ...), anywhere
         inside the stored name,
      3. name contained anywhere in the stored name.

    Row 4426 is always excluded (the reason is not visible in this file).
    Returns the id, or ``None`` when nothing matches or ``naziv`` is empty.
    """
    naziv = (naziv or "").strip()
    if not naziv:
        # An empty name would turn the "contains" pattern into '%%' and match
        # an arbitrary club.
        return None

    def first_id(sql, param):
        cur.execute(sql, (param,))
        row = cur.fetchone()
        return row[0] if row else None

    # Try exact
    kid = first_id(
        "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) = LOWER(%s) AND id != 4426 LIMIT 1",
        naziv)
    if kid is not None:
        return kid

    # Escape LIKE metacharacters so a scraped '%' or '_' is matched literally
    # (backslash is PostgreSQL's default LIKE escape character).
    safe = naziv.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    like_sql = ("SELECT id FROM pgz_sport.klubovi "
                "WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1")

    # Try with BK/MK/ŽBK prefix
    for prefix in ["BK", "MK", "ŽBK", "Boćarski klub", "Bočarski klub"]:
        kid = first_id(like_sql, f"%{prefix} {safe}%")
        if kid is not None:
            return kid

    # Last try: contains
    return first_id(like_sql, f"%{safe}%")
|
||||
|
||||
# === MAIN ===
# For every league in LIGE: download the page, parse the standings table,
# upsert the competition row and replace its standings rows.
conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    # Find the bocce federation id (first savezi row whose name looks like HBS).
    cr.execute("SELECT id FROM pgz_sport.savezi WHERE LOWER(naziv) ILIKE '%boćar%' OR LOWER(naziv) ILIKE '%boćan%' OR LOWER(naziv) ILIKE '%hbs%' LIMIT 1")
    savez_row = cr.fetchone()
    savez_id = savez_row[0] if savez_row else None
    print(f"savez_id (HBS): {savez_id}")

    total_natj = 0
    total_redova = 0
    total_pgz_klub = 0  # standings rows that were matched to a klubovi id
    for url, naziv, sezona, razina in LIGE:
        print(f"\n=== {naziv} {sezona} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue

        rows = parse_table(body)
        if not rows:
            print(" no rows parsed")
            continue

        # Check PGZ relevance: any club name containing any keyword.
        is_pgz = any(any(kw in row["klub"].lower() for kw in PGZ_KEYWORDS) for row in rows)

        # Insert natjecanje. external_id = league slug from the URL + season.
        external_id = url.split("?")[0].split("/lige/")[1].rstrip("/").replace("/", "_") + "_" + sezona.replace("/", "_")
        cr.execute("""
            INSERT INTO pgz_sport.natjecanja
            (sport, savez_id, naziv, razina, tip, sezona, external_id, external_url, source, status, pgz_relevant, source_url)
            VALUES ('boćanje', %s, %s, %s, 'liga', %s, %s, %s, 'hbs_savez', 'aktivno', %s, %s)
            ON CONFLICT (source, external_id) DO UPDATE SET
            updated_at = now(), pgz_relevant = EXCLUDED.pgz_relevant, source_url = EXCLUDED.source_url
            RETURNING id
            """, (savez_id, naziv, razina, sezona, external_id, url, is_pgz, url))
        natj_id = cr.fetchone()[0]
        total_natj += 1
        print(f" natjecanje_id: {natj_id} ({len(rows)} klubova) PGZ={is_pgz}")

        # Clear old data for this natjecanje + insert new.
        # NOTE(review): with autocommit on, delete + inserts are not atomic; a
        # failure mid-way leaves the league with partial standings.
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id = %s", (natj_id,))

        for row in rows:
            kid = find_klub_id(cr, row["klub"])
            if kid:
                total_pgz_klub += 1
            cr.execute("""
                INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno, porazi,
                gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                """, (natj_id, kid, row["klub"][:200], row["poz"], row["odigrano"], row["pobjede"],
                      row["nerij"], row["porazi"], row["gz"], row["gp"], row["razl"], row["bod"]))
            total_redova += 1

        time.sleep(0.6)  # be polite to the remote site between league pages

    print(f"\n=== TOTAL ===")
    print(f" natjecanja: {total_natj}")
    print(f" tablice rows: {total_redova}")
    print(f" matched klub_id: {total_pgz_klub}")

    # Verify
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja WHERE source='hbs_savez'")
    print(f" total HBS natjecanja in DB: {cr.fetchone()[0]}")
    cr.execute("SELECT count(*) FROM pgz_sport.natjecanja_tablice")
    print(f" total tablice rows in DB: {cr.fetchone()[0]}")
finally:
    # Release the connection even when a fetch/parse/SQL step raises.
    conn.close()
|
||||
Executable
+229
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.
|
||||
Parsira PDF rezultate, ekstrahira plasmane (1-3 = medalje),
|
||||
matcha s PGŽ igračima u DB-u, ubacuje u clan_nagrada.
|
||||
"""
|
||||
import os, re, sys, time, json, html as ht
|
||||
import urllib.request, urllib.parse
|
||||
import subprocess
|
||||
import psycopg2
|
||||
import datetime as dt
|
||||
|
||||
# Keyword arguments handed to psycopg2.connect(**DB).
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment instead.
DB = {
    "host": "10.10.0.2",
    "port": 6432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": "R1net2026!SecureDB#v7",
}
BASE = "https://hrvatski-bocarski-savez.hr"  # site root; page paths are appended to it
UA = "Mozilla/5.0 (PGZSportBot/1.0)"         # User-Agent sent with every request
TMP = "/tmp/hbs_pdf"                         # local cache directory for downloaded PDFs
os.makedirs(TMP, exist_ok=True)

DELAY = 1.0  # seconds slept between remote requests
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
|
||||
|
||||
def log(msg):
    """Print a timestamped message and append the same line to ``LOG_FP``.

    Writing the log file is best-effort: if it cannot be opened or written
    the message is still printed and the error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain non-ASCII text and must not depend
        # on the process locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, ValueError):
        # Missing directory, permissions, or an unencodable character.
        pass
|
||||
|
||||
def fetch_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Undecodable bytes are replaced. Any failure is logged through ``log`` and
    reported to the caller as ``None``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read()
            return payload.decode("utf-8", errors="replace")
    except Exception as exc:
        log(f"FETCH err {url}: {exc}")
        return None
|
||||
|
||||
def fetch_pdf(url, dst):
    """Download ``url`` to the local path ``dst``.

    The body is written to ``dst + ".part"`` and only renamed to ``dst`` after
    the write has completed, so a failed download never leaves an empty or
    truncated file at ``dst`` (the caller treats an existing ``dst`` as an
    already-downloaded PDF).

    Returns ``True`` on success; on any error logs it and returns ``False``.
    """
    part = f"{dst}.part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r:
            data = r.read()
        with open(part, "wb") as f:
            f.write(data)
        os.replace(part, dst)  # atomic rename within the same directory
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        try:
            os.remove(part)
        except OSError:
            pass  # nothing was written
        return False
|
||||
|
||||
def pdf_text(path):
    """Return the text of the PDF at ``path`` as produced by ``pdftotext -layout``.

    The tool's exit status is not checked; whatever it printed to stdout is
    decoded as UTF-8 (with replacement). If the tool cannot be run or times
    out after 20 seconds, an empty string is returned.
    """
    command = ["pdftotext", "-layout", path, "-"]
    try:
        completed = subprocess.run(command, capture_output=True, timeout=20, check=False)
        raw = completed.stdout
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""
|
||||
|
||||
def db():
    """Open a new psycopg2 connection from ``DB`` with autocommit enabled."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
||||
|
||||
def discover_pdfs(godina):
    """Discover all PDF result links for a given year from natjecanja pages.

    Three competition pages are fetched (Prvenstvo RH, Kup Hrvatske,
    Međunarodno). For every ``/cdn/content/...pdf`` link found, the age
    category is taken from the last ``<h3>`` and the discipline from the last
    ``<h4>...:`` within the 2000 characters preceding the link.

    Returns a list of dicts with the keys ``natj``, ``url``, ``label``,
    ``kat`` ("?" when no heading is found), ``disc`` (falls back to the link
    label) and ``godina``. Text values are HTML-entity decoded.
    """
    link_re = re.compile(
        r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>')
    pages = [
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    ]
    pdfs = []
    for natj, slug in pages:
        h = fetch_html(BASE + slug)
        if not h:
            continue
        # Find PDF links (they go through /cdn/content/...pdf)
        for m in link_re.finditer(h):
            # Attribute values and text are entity-encoded in HTML; decode so
            # "&amp;" etc. do not end up in stored labels or fetched URLs.
            url = ht.unescape(m.group(1))
            label = ht.unescape(m.group(2)).strip()
            # Context before the link: nearest preceding headings give the
            # age category (Seniori/Juniori/Kadeti...) and the discipline.
            pos = m.start()
            ctx_window = h[max(0, pos - 2000):pos]
            kat_m = re.findall(r'<h3[^>]*>([^<]+)</h3>', ctx_window)
            disc_m = re.findall(r'<h4[^>]*>([^<:]+)[:]', ctx_window)
            kat = ht.unescape(kat_m[-1]).strip() if kat_m else "?"
            disc = ht.unescape(disc_m[-1]).strip() if disc_m else label
            pdfs.append({"natj": natj, "url": url, "label": label,
                         "kat": kat, "disc": disc, "godina": godina})
        time.sleep(DELAY)
    return pdfs
|
||||
|
||||
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the tail of the document is scanned: everything from the last
    occurrence of "poredak" or "konačni" (case-insensitive), or the last 30%
    of the text when neither word occurs. A line counts as a placement when
    it reads ``<rank>[.] <name><two or more spaces><club>``.

    Despite the name, ranks 1 through 8 are returned. For each rank only the
    first matching line is kept. ``pdf_meta`` is accepted but not used.

    Returns a list of ``{"rank", "ime_full", "klub"}`` dicts in document order.
    """
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No ranking heading found: fall back to the last 30% of the document.
        start = int(len(text) * 0.7)

    # "1. Marko Markovic   K.K. KASTAV" - the dot after the rank is optional.
    placement_re = re.compile(
        r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")

    first_by_rank = {}
    for raw_line in text[start:].split("\n"):
        hit = placement_re.match(raw_line.strip())
        if hit is None:
            continue
        rank = int(hit.group(1))
        ime_full = hit.group(2).strip()
        klub = hit.group(3).strip()
        if rank < 1 or rank > 12 or len(ime_full) < 5 or len(klub) < 3:
            continue
        # Keep the first hit per rank, and only ranks up to 8.
        if rank <= 8 and rank not in first_by_rank:
            first_by_rank[rank] = {"rank": rank, "ime_full": ime_full, "klub": klub}
    return list(first_by_rank.values())
|
||||
|
||||
def find_clan(cr, ime_full):
    """Look up ``pgz_sport.clanovi.id`` for a full name; ``None`` if not found.

    The name is split on whitespace; fewer than two words never matches.
    Three lookups are tried in order and the first row found wins:
      1. first word as ``ime``, the rest as ``prezime``, sport = bocce,
      2. first word as ``ime``, last word as ``prezime`` (no sport filter),
      3. ``ime || ' ' || prezime`` equal to the whole name, sport like bocce.
    """
    words = ime_full.split()
    if len(words) < 2:
        return None

    attempts = (
        # Try ime+prezime
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
            AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1""",
         (words[0], " ".join(words[1:]))),
        # Try last word as prezime, first as ime
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1""",
         (words[0], words[-1])),
        # Try anywhere
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
         (ime_full,)),
    )
    for sql, params in attempts:
        cr.execute(sql, params)
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
|
||||
|
||||
def find_klub(cr, klub_name):
    """Return the id of the shortest-named club containing ``klub_name``.

    The match is a case-insensitive substring search on
    ``pgz_sport.klubovi.naziv``. Returns ``None`` for an empty name or when
    no club matches.
    """
    if not klub_name:
        return None
    pattern = "%" + klub_name.lower() + "%"
    cr.execute("""SELECT id FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
               (pattern,))
    hit = cr.fetchone()
    if hit:
        return hit[0]
    return None
|
||||
|
||||
# Placement -> medal name; placements without an entry get no medal.
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}


def main():
    """Scrape HBS result PDFs for 2025-2023 and store placements as awards.

    Steps: discover PDF links per year, download each PDF into ``TMP`` (files
    already present are reused), extract text, parse placements, match each
    name/club against the database and insert one ``clan_nagrada`` row per
    placement (conflicting rows are skipped). Finally prints a sample of the
    rows that were matched to a member.
    """
    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")

        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}…")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)

        log(f"Total PDFs to process: {len(all_pdfs)}")

        inserted = 0
        matched_clan = 0
        processed = 0

        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # NOTE(review): the cache name is derived from the URL's last path
            # segment only; two PDFs with the same file name in different
            # folders would share one cached file - confirm this cannot happen.
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)

            text = pdf_text(local)
            if not text or len(text) < 200:
                continue

            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")

            # Competition level code: championship / cup / anything else.
            razina = "DP" if "Prvenstvo" in pdf["natj"] else ("DK" if "Kup" in pdf["natj"] else "OS")
            natjecanje = f"{pdf['natj']} {pdf['godina']}"

            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id:
                    matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                        razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                        napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    if cr.rowcount:
                        inserted += 1
                except Exception as e:
                    # Keep going (one bad row must not stop the run) but make
                    # the failure visible instead of silently dropping it.
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")

        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")

        # Show sample of rows matched to a member
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            # NULL name/discipline columns are shown as '-' instead of raising.
            print(f" {(r[0] or '-'):25} ({r[1][:30] if r[1] else '-'}) {r[2]} - {r[3]}. ({r[4]}, {(r[5] or '-')[:30]}) {r[6]} clan={r[7]}")
    finally:
        # Release the connection even if discovery, parsing or SQL raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+229
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HBS PDF results scraper - Prvenstvo RH + Kup Hrvatske + Međunarodno natjecanje.
|
||||
Parsira PDF rezultate, ekstrahira plasmane (1-3 = medalje),
|
||||
matcha s PGŽ igračima u DB-u, ubacuje u clan_nagrada.
|
||||
"""
|
||||
import os, re, sys, time, json, html as ht
|
||||
import urllib.request, urllib.parse
|
||||
import subprocess
|
||||
import psycopg2
|
||||
import datetime as dt
|
||||
|
||||
# Keyword arguments handed to psycopg2.connect(**DB).
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment instead.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": "R1net2026!SecureDB#v7",
}
BASE = "https://hrvatski-bocarski-savez.hr"  # site root; page paths are appended to it
UA = "Mozilla/5.0 (PGZSportBot/1.0)"         # User-Agent sent with every request
TMP = "/tmp/hbs_pdf"                         # local cache directory for downloaded PDFs
os.makedirs(TMP, exist_ok=True)

DELAY = 1.0  # seconds slept between remote requests
LOG_FP = "/opt/pgz-sport/_logs/hbs_pdf_results.log"
|
||||
|
||||
def log(msg):
    """Print a timestamped message and append the same line to ``LOG_FP``.

    Writing the log file is best-effort: if it cannot be opened or written
    the message is still printed and the error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain non-ASCII text and must not depend
        # on the process locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except (OSError, ValueError):
        # Missing directory, permissions, or an unencodable character.
        pass
|
||||
|
||||
def fetch_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Undecodable bytes are replaced. Any failure is logged through ``log`` and
    reported to the caller as ``None``.
    """
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            payload = response.read()
            return payload.decode("utf-8", errors="replace")
    except Exception as exc:
        log(f"FETCH err {url}: {exc}")
        return None
|
||||
|
||||
def fetch_pdf(url, dst):
    """Download ``url`` to the local path ``dst``.

    The body is written to ``dst + ".part"`` and only renamed to ``dst`` after
    the write has completed, so a failed download never leaves an empty or
    truncated file at ``dst`` (the caller treats an existing ``dst`` as an
    already-downloaded PDF).

    Returns ``True`` on success; on any error logs it and returns ``False``.
    """
    part = f"{dst}.part"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=30) as r:
            data = r.read()
        with open(part, "wb") as f:
            f.write(data)
        os.replace(part, dst)  # atomic rename within the same directory
        return True
    except Exception as e:
        log(f"PDF fetch err {url}: {e}")
        try:
            os.remove(part)
        except OSError:
            pass  # nothing was written
        return False
|
||||
|
||||
def pdf_text(path):
    """Return the text of the PDF at ``path`` as produced by ``pdftotext -layout``.

    The tool's exit status is not checked; whatever it printed to stdout is
    decoded as UTF-8 (with replacement). If the tool cannot be run or times
    out after 20 seconds, an empty string is returned.
    """
    command = ["pdftotext", "-layout", path, "-"]
    try:
        completed = subprocess.run(command, capture_output=True, timeout=20, check=False)
        raw = completed.stdout
        return raw.decode("utf-8", errors="replace")
    except Exception:
        return ""
|
||||
|
||||
def db():
    """Open a new psycopg2 connection from ``DB`` with autocommit enabled."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
||||
|
||||
def discover_pdfs(godina):
    """Discover all PDF result links for a given year from natjecanja pages.

    Three competition pages are fetched (Prvenstvo RH, Kup Hrvatske,
    Međunarodno). For every ``/cdn/content/...pdf`` link found, the age
    category is taken from the last ``<h3>`` and the discipline from the last
    ``<h4>...:`` within the 2000 characters preceding the link.

    Returns a list of dicts with the keys ``natj``, ``url``, ``label``,
    ``kat`` ("?" when no heading is found), ``disc`` (falls back to the link
    label) and ``godina``. Text values are HTML-entity decoded.
    """
    link_re = re.compile(
        r'href="(https://hrvatski-bocarski-savez\.hr/cdn/content/[^"]+\.pdf)"[^>]*>(?:<i[^>]*></i>\s*)?([^<]+)</a>')
    pages = [
        ("Prvenstvo RH", f"/ostala-natjecanja/prvenstvo-rh/?godina={godina}"),
        ("Kup Hrvatske", f"/ostala-natjecanja/kup-hrvatske/?godina={godina}"),
        ("Međunarodno", f"/ostala-natjecanja/medunarodno-natjecanje/?godina={godina}"),
    ]
    pdfs = []
    for natj, slug in pages:
        h = fetch_html(BASE + slug)
        if not h:
            continue
        # Find PDF links (they go through /cdn/content/...pdf)
        for m in link_re.finditer(h):
            # Attribute values and text are entity-encoded in HTML; decode so
            # "&amp;" etc. do not end up in stored labels or fetched URLs.
            url = ht.unescape(m.group(1))
            label = ht.unescape(m.group(2)).strip()
            # Context before the link: nearest preceding headings give the
            # age category (Seniori/Juniori/Kadeti...) and the discipline.
            pos = m.start()
            ctx_window = h[max(0, pos - 2000):pos]
            kat_m = re.findall(r'<h3[^>]*>([^<]+)</h3>', ctx_window)
            disc_m = re.findall(r'<h4[^>]*>([^<:]+)[:]', ctx_window)
            kat = ht.unescape(kat_m[-1]).strip() if kat_m else "?"
            disc = ht.unescape(disc_m[-1]).strip() if disc_m else label
            pdfs.append({"natj": natj, "url": url, "label": label,
                         "kat": kat, "disc": disc, "godina": godina})
        time.sleep(DELAY)
    return pdfs
|
||||
|
||||
def parse_pdf_for_top3(text, pdf_meta):
    """Extract final placements from the text of a results PDF.

    Only the tail of the document is scanned: everything from the last
    occurrence of "poredak" or "konačni" (case-insensitive), or the last 30%
    of the text when neither word occurs. A line counts as a placement when
    it reads ``<rank>[.] <name><two or more spaces><club>``.

    Despite the name, ranks 1 through 8 are returned. For each rank only the
    first matching line is kept. ``pdf_meta`` is accepted but not used.

    Returns a list of ``{"rank", "ime_full", "klub"}`` dicts in document order.
    """
    lowered = text.lower()
    start = max(lowered.rfind("poredak"), lowered.rfind("kona\u010dni"))
    if start < 0:
        # No ranking heading found: fall back to the last 30% of the document.
        start = int(len(text) * 0.7)

    # "1. Marko Markovic   K.K. KASTAV" - the dot after the rank is optional.
    placement_re = re.compile(
        r"^(\d{1,2})\.?\s+([A-ZČĆŠŽĐ][\wčćšžđa-zA-Z\s\.\-]{4,40}?)\s{2,}([A-ZČĆŠŽĐ\.][\w\.\sčćšžđA-Za-z\-]+?)\s*$")

    first_by_rank = {}
    for raw_line in text[start:].split("\n"):
        hit = placement_re.match(raw_line.strip())
        if hit is None:
            continue
        rank = int(hit.group(1))
        ime_full = hit.group(2).strip()
        klub = hit.group(3).strip()
        if rank < 1 or rank > 12 or len(ime_full) < 5 or len(klub) < 3:
            continue
        # Keep the first hit per rank, and only ranks up to 8.
        if rank <= 8 and rank not in first_by_rank:
            first_by_rank[rank] = {"rank": rank, "ime_full": ime_full, "klub": klub}
    return list(first_by_rank.values())
|
||||
|
||||
def find_clan(cr, ime_full):
    """Look up ``pgz_sport.clanovi.id`` for a full name; ``None`` if not found.

    The name is split on whitespace; fewer than two words never matches.
    Three lookups are tried in order and the first row found wins:
      1. first word as ``ime``, the rest as ``prezime``, sport = bocce,
      2. first word as ``ime``, last word as ``prezime`` (no sport filter),
      3. ``ime || ' ' || prezime`` equal to the whole name, sport like bocce.
    """
    words = ime_full.split()
    if len(words) < 2:
        return None

    attempts = (
        # Try ime+prezime
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s)
            AND LOWER(sport) IN ('boćanje','bocanje','bo\u0107anje') LIMIT 1""",
         (words[0], " ".join(words[1:]))),
        # Try last word as prezime, first as ime
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime) = LOWER(%s) AND LOWER(prezime) = LOWER(%s) LIMIT 1""",
         (words[0], words[-1])),
        # Try anywhere
        ("""SELECT id FROM pgz_sport.clanovi
            WHERE LOWER(ime || ' ' || prezime) = LOWER(%s) AND LOWER(sport) LIKE '%%boć%%' LIMIT 1""",
         (ime_full,)),
    )
    for sql, params in attempts:
        cr.execute(sql, params)
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
|
||||
|
||||
def find_klub(cr, klub_name):
    """Return the id of the shortest-named club containing ``klub_name``.

    The match is a case-insensitive substring search on
    ``pgz_sport.klubovi.naziv``. Returns ``None`` for an empty name or when
    no club matches.
    """
    if not klub_name:
        return None
    pattern = "%" + klub_name.lower() + "%"
    cr.execute("""SELECT id FROM pgz_sport.klubovi
                  WHERE LOWER(naziv) ILIKE %s ORDER BY LENGTH(naziv) ASC LIMIT 1""",
               (pattern,))
    hit = cr.fetchone()
    if hit:
        return hit[0]
    return None
|
||||
|
||||
# Placement -> medal name; placements without an entry get no medal.
PLAS_TO_MED = {1: "ZLATO", 2: "SREBRO", 3: "BRONCA"}


def main():
    """Scrape HBS result PDFs for 2025-2023 and store placements as awards.

    Steps: discover PDF links per year, download each PDF into ``TMP`` (files
    already present are reused), extract text, parse placements, match each
    name/club against the database and insert one ``clan_nagrada`` row per
    placement (conflicting rows are skipped). Finally prints a sample of the
    rows that were matched to a member.
    """
    conn = db()
    try:
        cr = conn.cursor()
        log("=== HBS PDF results scraper START ===")

        all_pdfs = []
        for godina in [2025, 2024, 2023]:
            log(f"Discovering year {godina}…")
            pdfs = discover_pdfs(godina)
            log(f" {godina}: {len(pdfs)} PDFs")
            all_pdfs.extend(pdfs)

        log(f"Total PDFs to process: {len(all_pdfs)}")

        inserted = 0
        matched_clan = 0
        processed = 0

        for pdf in all_pdfs:
            processed += 1
            url = pdf["url"]
            # NOTE(review): the cache name is derived from the URL's last path
            # segment only; two PDFs with the same file name in different
            # folders would share one cached file - confirm this cannot happen.
            fname = re.sub(r'[^\w]', '_', url.split("/")[-1])[:80]
            local = f"{TMP}/{fname}"
            if not os.path.exists(local):
                if not fetch_pdf(url, local):
                    continue
                time.sleep(DELAY)

            text = pdf_text(local)
            if not text or len(text) < 200:
                continue

            top3 = parse_pdf_for_top3(text, pdf)
            if processed % 30 == 0:
                log(f" Progress {processed}/{len(all_pdfs)}, inserted {inserted}, matched {matched_clan}")

            # Competition level code: championship / cup / anything else.
            razina = "DP" if "Prvenstvo" in pdf["natj"] else ("DK" if "Kup" in pdf["natj"] else "OS")
            natjecanje = f"{pdf['natj']} {pdf['godina']}"

            for r in top3:
                clan_id = find_clan(cr, r["ime_full"])
                if clan_id:
                    matched_clan += 1
                klub_id = find_klub(cr, r["klub"])
                try:
                    cr.execute("""INSERT INTO pgz_sport.clan_nagrada
                        (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                        razina_natjecanja, dobna_kategorija, disciplina, plasman, medalja,
                        napomena, source, source_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                        ON CONFLICT DO NOTHING""",
                        (clan_id, r["ime_full"], klub_id, r["klub"], pdf["godina"],
                         natjecanje, razina, pdf["kat"][:30], pdf["disc"][:60],
                         r["rank"], PLAS_TO_MED.get(r["rank"]),
                         pdf["label"], "hbs_pdf_results", url))
                    if cr.rowcount:
                        inserted += 1
                except Exception as e:
                    # Keep going (one bad row must not stop the run) but make
                    # the failure visible instead of silently dropping it.
                    log(f"INSERT err {r['ime_full']} ({url}): {e}")

        log(f"=== DONE: {inserted} new nagrade inserted, {matched_clan} matched to clanovi, processed {processed} PDFs ===")

        # Show sample of rows matched to a member
        cr.execute("""SELECT ime_prezime, klub_naziv, godina, plasman, dobna_kategorija, disciplina, natjecanje, clan_id
            FROM pgz_sport.clan_nagrada
            WHERE source = 'hbs_pdf_results' AND clan_id IS NOT NULL
            ORDER BY plasman, godina DESC LIMIT 25""")
        print("\n=== SAMPLE matched ===")
        for r in cr.fetchall():
            # NULL name/discipline columns are shown as '-' instead of raising.
            print(f" {(r[0] or '-'):25} ({r[1][:30] if r[1] else '-'}) {r[2]} - {r[3]}. ({r[4]}, {(r[5] or '-')[:30]}) {r[6]} clan={r[7]}")
    finally:
        # Release the connection even if discovery, parsing or SQL raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+337
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
|
||||
import os, re, sys, time, json, html, traceback, datetime as dt
|
||||
import urllib.request, urllib.error
|
||||
from urllib.parse import urljoin
|
||||
import psycopg2
|
||||
|
||||
# Keyword arguments handed to psycopg2.connect(**DB).
# The original call passed ``port`` twice (6432 and 5432), which Python rejects
# as a SyntaxError. 6432 is kept because it is the port used together with
# host 10.10.0.2 elsewhere in this code base - confirm for this deployment.
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment instead.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3", user="rinet", password="R1net2026!SecureDB#v7")
BASE = "https://hrvatski-bocarski-savez.hr"  # site root used to build absolute URLs
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
DELAY = 1.2  # base delay in seconds between requests
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
|
||||
|
||||
def log(msg):
    """Print a timestamped message and append the same line to ``LOG_FP``.

    Writing the log file is best-effort: if it cannot be opened or written
    the message is still printed and the error is ignored.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Explicit UTF-8: messages contain non-ASCII text and must not depend
        # on the process locale.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line+"\n")
    except (OSError, ValueError):
        # Missing directory, permissions, or an unencodable character.
        pass
|
||||
|
||||
def db():
    """Open a new psycopg2 connection from ``DB`` with autocommit enabled."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
||||
|
||||
def fetch(url, retries=2):
    """Download ``url`` as UTF-8 text, retrying on failure.

    Up to ``retries + 1`` attempts are made, sleeping ``DELAY * 2`` seconds
    between them. HTTP 404/410 give up immediately without logging. After the
    last failed attempt the error is logged. Returns the page text, or
    ``None`` when the page could not be fetched.
    """
    headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    for attempt in range(retries+1):
        last_attempt = attempt == retries
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as err:
            if err.code in (404, 410):
                # The page does not exist; retrying cannot help.
                return None
            if last_attempt:
                log(f"HTTP {err.code} {url}")
                return None
        except Exception as err:
            if last_attempt:
                log(f"FETCH err {err} {url}")
                return None
        time.sleep(DELAY * 2)
|
||||
|
||||
# === KLUB PARSER ===
|
||||
def parse_klub(h, slug):
    """Parse an HBS club page into a plain dict.

    Returns ``None`` when ``h`` is empty or no usable club name is found.
    Otherwise returns ``{"slug", "naziv", "logo", "info", "igraci",
    "voditelji"}`` where ``info`` holds the labelled bullet values that were
    recognised, ``igraci`` is a list of player dicts (``slug``, ``iskaznica``,
    ``ime_prezime``, ``godina_rodenja``) and ``voditelji`` holds at most ten
    team-leader names.
    """
    if not h:
        return None

    # Name: the first <h3> that is not the federation banner or a sponsor box.
    naziv = None
    for raw_title in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        title = html.unescape(raw_title.strip())
        if not title or len(title) >= 80:
            continue
        if 'Fédération' in title or 'sponzor' in title.lower():
            continue
        naziv = title
        break
    if not naziv:
        return None

    # Logo
    logo_m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, logo_m.group(1)) if logo_m else None

    # "<strong>Label:</strong> value" bullets. The first rule whose needle is
    # contained in the lower-cased label decides the target field.
    label_rules = (
        (('županija',), 'zupanija'),
        (('liga',), 'liga'),
        (('adresa',), 'adresa'),
        (('sportske grane', 'sportska grana'), 'sportske_grane'),
        (('osoba za kontakt',), 'kontakt_osoba'),
        (('tel',), 'telefon'),
        (('oib',), 'oib'),
    )
    info = {}
    for bullet in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        label = bullet.group(1).strip().lower()
        value = html.unescape(re.sub(r'<[^>]+>', '', bullet.group(2).strip()))
        for needles, field in label_rules:
            if any(needle in label for needle in needles):
                info[field] = value
                break

    # Players - pattern: <li><a href="...igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    player_re = r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?'
    igraci = [
        {
            "slug": pm.group(1),
            "iskaznica": pm.group(2).strip(),
            "ime_prezime": html.unescape(pm.group(3).strip()),
            "godina_rodenja": int(pm.group(4)),
        }
        for pm in re.finditer(player_re, h)
    ]

    # Team leaders (coaches) - tab #popis_voditelja_ekipe
    voditelji = []
    section = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if section:
        section_html = section.group(1)
        for pm in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section_html):
            name = re.sub(r'\s+', ' ', pm.group(1).strip())
            if len(name) > 4 and len(name.split()) >= 2 and 'Trenutno' not in name and name not in voditelji:
                voditelji.append(name)
        # Fallback without <p> tags: any text line whose first two words are
        # capitalised. NOTE(review): nested under the section check because it
        # reads the section body; the reviewed source had lost its indentation.
        if not voditelji:
            for line in re.sub(r'<[^>]+>', '\n', section_html).split('\n'):
                line = line.strip()
                if len(line) <= 4 or 'Trenutno' in line:
                    continue
                words = line.split()
                if len(words) < 2:
                    continue
                if all(w[0].isupper() for w in words[:2] if w):
                    voditelji.append(line)

    return {
        "slug": slug, "naziv": naziv, "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10],
    }
|
||||
|
||||
# === IGRAČ PARSER ===
|
||||
def parse_igrac(h, slug):
    """Parse an HBS player page into a plain dict.

    Returns ``None`` for empty input. Otherwise returns ``{"slug", "ime",
    "prezime", "full_name", "slika_url", "info", "karijera"}``. The name is
    the first suitable ``<h3>`` (at least two words), falling back to the
    title-cased slug; its first word becomes ``ime`` and the rest ``prezime``.
    ``info`` may contain ``iskaznica``, ``godina_rodenja`` (int) and
    ``maticni_klub``. ``karijera`` lists the rows of the "Sportski put" table.
    """
    if not h:
        return None

    h3s = re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h)
    full_name = None
    for cand in h3s:
        cand = html.unescape(cand.strip())
        # Skip the federation banner and section headings such as "Sportski put".
        if cand and 'Fédération' not in cand and 'Sport' not in cand and len(cand) < 80 and len(cand.split()) >= 2:
            full_name = cand
            break
    if not full_name:
        full_name = slug.replace("-", " ").title()
    parts = full_name.split()
    ime = parts[0] if parts else ""
    prezime = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Photo
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, m.group(1)) if m else None

    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        val = re.sub(r'<[^>]+>', '', m.group(2).strip()).rstrip('.')
        val = html.unescape(val)
        if 'iskaznic' in key:
            info['iskaznica'] = val
        elif 'godina rođenja' in key:
            # Explicit check instead of a bare except: a value without a
            # four-digit year simply leaves the field unset.
            year = re.search(r'(\d{4})', val)
            if year:
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in key:
            info['maticni_klub'] = val

    def text_of(cell):
        """Return the cell content without tags, stripped."""
        return re.sub(r'<[^>]+>', '', cell).strip()

    # Career ("Sportski put") - registration table
    karijera = []
    table_m = re.search(r'### Sportski put.*?</table>', h, re.S)
    if not table_m:
        table_m = re.search(r'Sportski put.*?</table>', h, re.S)
    if table_m:
        rows = re.findall(r'<tr>(.*?)</tr>', table_m.group(0), re.S)
        for r in rows[1:]:  # skip header
            cells = re.findall(r'<td[^>]*>(.*?)</td>', r, re.S)
            if len(cells) >= 4:
                karijera.append({
                    "datum_reg": text_of(cells[0]).rstrip('.'),
                    "klub": text_of(cells[1]),
                    "sportska_grana": text_of(cells[2]),
                    "sezona": text_of(cells[3]),
                    # fifth column (medical check) is optional
                    "lijecnicki": text_of(cells[4]).rstrip('.') if len(cells) > 4 else None,
                })

    return {
        "slug": slug, "ime": ime, "prezime": prezime, "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera,
    }
|
||||
|
||||
# PGŽ clubs - the actual HBS slugs (as listed for bocarski-savez-pgz).
PGZ_HBS_CLUBS = [
    # Senior clubs
    "kastav",
    "kostrena",
    "krenovac",
    "krimeja",
    "krk",
    "lovran",
    "opatija",
    "rijeka-2",
    "srdoci",
    "sveti-jakov",
    "sveti-rok-klana",
    "vargon",
    "hreljin",
    "draga-moscenicka-draga",
    "lovranska-draga",
    "brod-moravice",
    # Women's clubs
    "zenski-bocarski-klub-cavle",
    "zenski-bocarski-klub-drenova-rijeka",
    "zenski-bocarski-klub-hreljin",
    "zenski-bocarski-klub-kastav",
    "zenska-bocarska-ekipa-kastav-2",
    # Cadet / junior teams (youth categories)
    "cavle-skola-bocanja",
    "juniorska-ekipa-cavle-sb-1",
    "juniorska-ekipa-kastav",
    "juniorska-ekipa-lovran",
    "juniorska-ekipa-sv-rok-klana",
    "juniorska-ekipa-vargon",
    "kadetska-ekipa-bk-cavle-sb-2",
    "kadetska-ekipa-bk-kastav-2",
    "kadetska-ekipa-bk-lovran",
    "kadetska-ekipa-bk-sveti-jakov-2",
    "kadetska-ekipa-bk-vargon",
    "kadetska-ekipa-kastav",
    "kadetska-ekipa-zbk-drenova",
]
|
||||
|
||||
def upsert_klub(conn, k):
    """Insert or update one boćanje club row in pgz_sport.klubovi and return its id.

    Args:
        conn: open DB connection exposing ``cursor()`` (psycopg2 in this file).
        k: parsed club dict; reads ``k['slug']``, ``k['naziv']`` and the optional
            ``k['info']`` keys ``adresa``, ``telefon``, ``oib`` and ``liga``.

    Returns:
        The ``id`` of the matched or newly inserted club row.

    The club is looked up first by its ``hbs:<slug>`` marker in ``napomena`` and
    then by sport + case-insensitive name.
    """
    slug = k['slug']
    naziv = k['naziv']
    info = k.get('info') or {}
    adresa = info.get('adresa')

    # The markers written below are always followed by ',' (INSERT) or ']' (UPDATE).
    # Anchoring on that delimiter stops "hbs:lovran" from matching
    # "hbs:lovranska-draga". LIKE wildcards inside the slug are escaped.
    like_slug = slug.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    markers = (f"%hbs:{like_slug},%", f"%hbs:{like_slug}]%")

    # Town = trailing one or two words of the address, without a leading
    # postal code ("51000 Rijeka" -> "Rijeka").
    grad = None
    if adresa:
        m = re.search(r'(\w+(?:\s+\w+)?)$', adresa.strip())
        if m:
            grad = re.sub(r'^\d+\s+', '', m.group(1))

    with conn.cursor() as cur:
        cur.execute(
            "SELECT id FROM pgz_sport.klubovi WHERE napomena ILIKE %s OR napomena ILIKE %s LIMIT 1",
            markers)
        row = cur.fetchone()
        if not row:
            cur.execute(
                "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
                (naziv,))
            row = cur.fetchone()

        if row:
            kid = row[0]
            # Append the sync marker only once; re-appending it on every run
            # made napomena grow without bound.
            cur.execute("""UPDATE pgz_sport.klubovi SET
                    adresa=COALESCE(%s, adresa),
                    telefon=COALESCE(%s, telefon),
                    grad=COALESCE(%s, grad),
                    napomena=CASE
                        WHEN napomena ILIKE %s OR napomena ILIKE %s THEN napomena
                        ELSE COALESCE(napomena,'') || ' [HBS sync ' || CURRENT_DATE || ': hbs:' || %s || ']'
                    END
                WHERE id=%s""",
                (adresa, info.get('telefon'), grad, *markers, slug, kid))
        else:
            cur.execute("""INSERT INTO pgz_sport.klubovi
                    (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
                VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
                RETURNING id""",
                (naziv, grad, adresa, info.get('telefon'),
                 f"[HBS sync {dt.date.today()}: hbs:{slug}, OIB:{info.get('oib','-')}, liga:{info.get('liga','-')}]"))
            kid = cur.fetchone()[0]
    return kid
|
||||
|
||||
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update one player row in pgz_sport.clanovi and return its id.

    Args:
        conn: open DB connection exposing ``cursor()``.
        p: parsed player dict; reads ``ime``, ``prezime``, ``slug``, optional
            ``slika_url`` and the ``info`` keys ``iskaznica`` / ``godina_rodenja``.
        klub_db_id: id of the player's club row.
        klub_naziv: club name stored in ``klub_naziv_godisnjak``.

    Returns:
        The ``id`` of the updated or inserted member row.
    """
    info = p.get('info') or {}
    # A blank card number becomes NULL rather than '' so it cannot be mistaken
    # for a real source_id.
    iskaznica = (info.get('iskaznica') or '').strip() or None
    god = info.get('godina_rodenja')
    slug = p['slug']
    src_url = f"{BASE}/igraci/{slug}/"

    with conn.cursor() as cur:
        cid = None
        if iskaznica:
            # Primary key for matching: the HBS card number.
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s",
                (iskaznica,))
            row = cur.fetchone()
            if row:
                cid = row[0]
        else:
            # Without a card number fall back to the slug this function stores,
            # otherwise every re-sync would insert a duplicate row.
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND slug=%s ORDER BY id LIMIT 1",
                (slug,))
            row = cur.fetchone()
            if row:
                cid = row[0]

        if cid is not None:
            cur.execute("""UPDATE pgz_sport.clanovi SET
                    ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                    klub_id=%s, klub_naziv_godisnjak=%s,
                    slika_url=COALESCE(%s, slika_url),
                    godina_rodenja=COALESCE(%s, godina_rodenja),
                    slug=%s,
                    source='hbs_savez', source_id=COALESCE(%s, source_id), source_url=%s, source_synced_at=now()
                WHERE id=%s""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, slug, iskaznica, src_url, cid))
        else:
            cur.execute("""INSERT INTO pgz_sport.clanovi
                    (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
                     godina_rodenja, slug, source, source_id, source_url, source_synced_at)
                VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
                RETURNING id""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, slug, iskaznica, src_url))
            cid = cur.fetchone()[0]
    return cid
|
||||
|
||||
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Insert or update a team leader (stored as a coach) in pgz_sport.clanovi.

    Args:
        conn: open DB connection exposing ``cursor()``.
        name: full name; the first token is the given name, the rest the surname.
        klub_db_id: id of the club row the leader belongs to.
        klub_naziv: club name stored in ``klub_naziv_godisnjak``.
        role: value written to ``uloga`` (default ``'trener'``).

    Returns:
        The member row id, or None when ``name`` has fewer than two tokens.
    """
    parts = (name or '').strip().split()
    if len(parts) < 2:
        return None
    ime, prezime = parts[0], " ".join(parts[1:])
    src_url = f"{BASE}/klubovi/"

    with conn.cursor() as cur:
        # Several people may share a name. Prefer the one already attached to
        # this club, then the lowest id, so the choice is deterministic.
        cur.execute("""SELECT id FROM pgz_sport.clanovi
                WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'
                ORDER BY (klub_id = %s) DESC NULLS LAST, id ASC
                LIMIT 1""",
                (ime, prezime, klub_db_id))
        row = cur.fetchone()
        if row:
            # Existing member: set the role, fill club/source fields only if empty.
            cur.execute("""UPDATE pgz_sport.clanovi SET
                    uloga=%s, klub_id=COALESCE(klub_id, %s),
                    klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
                    source_url=COALESCE(source_url, %s)
                WHERE id=%s""",
                (role, klub_db_id, klub_naziv, src_url, row[0]))
            return row[0]

        cur.execute("""INSERT INTO pgz_sport.clanovi
                (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
                 source, source_url, source_synced_at)
            VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
            RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, src_url))
        return cur.fetchone()[0]
|
||||
|
||||
def main():
    """Scrape every club in PGZ_HBS_CLUBS and upsert the club, its leaders and players.

    For each slug the club page is fetched and parsed, the club row is upserted,
    then each team leader and each player (one profile request per player,
    spaced by DELAY seconds) is upserted. Failures for a single club, leader or
    player are logged and skipped; the connection is always closed on exit.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")

        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            url = f"{BASE}/klubovi/{slug}/"
            log(f"→ KLUB {slug}")
            h = fetch(url)
            if not h:
                log(f" ✗ klub ne postoji ili 404")
                continue

            parsed = parse_klub(h, slug)
            if not parsed:
                log(f" ✗ ne mogu parse")
                continue

            # A DB error on one club must not abort the remaining clubs.
            try:
                kid = upsert_klub(conn, parsed)
            except Exception as e:
                log(f" ✗ klub {slug}: {e}")
                continue
            log(f" ✓ {parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")
            success += 1

            # Team leaders
            for v in parsed['voditelji']:
                try:
                    upsert_voditelj(conn, v, kid, parsed['naziv'])
                    log(f" ✓ voditelj: {v}")
                except Exception as e:
                    log(f" ✗ voditelj {v}: {e}")

            # Players - fetch the profile page of each one
            for ig in parsed['igraci']:
                time.sleep(DELAY)
                try:
                    purl = f"{BASE}/igraci/{ig['slug']}/"
                    ph = fetch(purl)
                    if not ph:
                        continue
                    pdata = parse_igrac(ph, ig['slug'])
                    if not pdata:
                        continue
                    # If the profile parser picked up the federation banner as
                    # the name, fall back to the name from the club's list.
                    if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
                        pdata['full_name'] = ig['ime_prezime']
                        parts = ig['ime_prezime'].split()
                        pdata['ime'] = parts[0] if parts else ''
                        pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
                    # Card number and birth year from the list fill profile gaps.
                    if not pdata.get('info', {}).get('iskaznica'):
                        pdata.setdefault('info', {})['iskaznica'] = ig.get('iskaznica')
                    if not pdata.get('info', {}).get('godina_rodenja'):
                        pdata.setdefault('info', {})['godina_rodenja'] = ig.get('godina_rodenja')
                    cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
                    players_total += 1
                    log(f" ✓ {pdata['ime']} {pdata['prezime']} (db={cid}, god={pdata.get('info',{}).get('godina_rodenja')})")
                except Exception as e:
                    log(f" ✗ igrač {ig['slug']}: {e}")

            time.sleep(DELAY)

        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
+337
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HBS Hrvatski boćarski savez scraper - prava bota za PGŽ klubove."""
|
||||
import os, re, sys, time, json, html, traceback, datetime as dt
|
||||
import urllib.request, urllib.error
|
||||
from urllib.parse import urljoin
|
||||
import psycopg2
|
||||
|
||||
# Keyword arguments for psycopg2.connect(**DB).
# NOTE(review): the password is committed in clear text - consider reading it
# from the environment instead.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": "R1net2026!SecureDB#v7",
}
# Root URL of the federation site; club and player URLs are built from it.
BASE = "https://hrvatski-bocarski-savez.hr"
# User-Agent header sent with every request.
UA = "Mozilla/5.0 (compatible; PGZSportBot/1.0; +https://api.rinet.one/sport)"
# Seconds slept between requests (doubled between fetch retries).
DELAY = 1.2
# File that log() appends to.
LOG_FP = "/opt/pgz-sport/_logs/hbs_scraper.log"
|
||||
|
||||
def log(msg):
    """Print a timestamped line to stdout and append it to LOG_FP (best effort).

    Args:
        msg: text to log; it is prefixed with the current local ISO timestamp.
    """
    line = f"[{dt.datetime.now().isoformat()}] {msg}"
    print(line, flush=True)
    try:
        # Log lines contain non-ASCII text (names, arrows, check marks), so the
        # encoding is fixed rather than left to the locale default.
        with open(LOG_FP, "a", encoding="utf-8") as f:
            f.write(line + "\n")
    except OSError:
        # File logging is optional: a missing directory or a permission
        # problem must not stop the scraper.
        pass
|
||||
|
||||
def db():
    """Open a psycopg2 connection from DB and return it with autocommit enabled."""
    connection = psycopg2.connect(**DB)
    connection.autocommit = True
    return connection
|
||||
|
||||
def fetch(url, retries=2):
    """Download ``url`` and return its body as text, or None on failure.

    Args:
        url: address to request.
        retries: number of extra attempts after the first one.

    Returns:
        The response decoded as UTF-8 (undecodable bytes replaced). None for
        HTTP 404/410, or once all attempts have failed (the last error is
        logged).
    """
    headers = {"User-Agent": UA, "Accept-Language": "hr,en"}
    for attempt in range(retries + 1):
        out_of_attempts = attempt == retries
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, timeout=20) as response:
                return response.read().decode("utf-8", errors="replace")
        except urllib.error.HTTPError as err:
            # Missing/gone pages are final - do not retry them.
            if err.code in (404, 410):
                return None
            if out_of_attempts:
                log(f"HTTP {err.code} {url}")
                return None
        except Exception as err:
            if out_of_attempts:
                log(f"FETCH err {err} {url}")
                return None
        # Back off before the next attempt.
        time.sleep(DELAY * 2)
    return None
|
||||
|
||||
# === KLUB PARSER ===
|
||||
def parse_klub(h, slug):
    """Parse a club page into name, logo, contact info, players and team leaders.

    Args:
        h: HTML text of the club page (may be empty or None).
        slug: club slug, copied unchanged into the result.

    Returns:
        A dict with keys ``slug``, ``naziv``, ``logo``, ``info``, ``igraci`` and
        ``voditelji`` (at most 10 names), or None when ``h`` is empty or no
        usable ``<h3>`` club name is found.
    """
    if not h:
        return None

    # Name: the first <h3> that is neither the federation banner
    # ("Fédération Croate de Boules") nor a sponsor heading.
    naziv = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'sponzor' not in cand.lower() and len(cand) < 80:
            naziv = cand
            break
    if not naziv:
        return None

    # Logo: an image under /cdn/content/ carrying alt="Klub".
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Klub"', h)
    logo = urljoin(BASE, m.group(1)) if m else None

    # "<strong>Label:</strong> value" bullets (county, league, address, ...).
    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        val = html.unescape(re.sub(r'<[^>]+>', '', m.group(2).strip()))
        if 'županija' in key:
            info['zupanija'] = val
        elif 'liga' in key:
            info['liga'] = val
        elif 'adresa' in key:
            info['adresa'] = val
        elif 'sportske grane' in key or 'sportska grana' in key:
            info['sportske_grane'] = val
        elif 'osoba za kontakt' in key:
            info['kontakt_osoba'] = val
        elif 'tel' in key:
            info['telefon'] = val
        elif 'oib' in key:
            info['oib'] = val

    # Players: <li><a href=".../igraci/SLUG/">N. E-XX-YY, <strong>Name</strong>, YYYY.</a></li>
    igraci = [
        {
            "slug": m.group(1),
            "iskaznica": m.group(2).strip(),
            "ime_prezime": html.unescape(m.group(3).strip()),
            "godina_rodenja": int(m.group(4)),
        }
        for m in re.finditer(
            r'<li><a\s+href="https?://[^/]+/igraci/([\w\-]+)/?"[^>]*>\s*\d+\.\s*([A-Z][\d\-]+),\s*<strong>([^<]+)</strong>,\s*(\d{4})\.?',
            h,
        )
    ]

    # Team leaders (coaches) - the #popis_voditelja_ekipe tab.
    voditelji = []
    vsec = re.search(r'id="popis_voditelja_ekipe"[^>]*>(.*?)(?:<div\s+(?:role|class)|</section>|<!--)', h, re.S)
    if vsec:
        section = vsec.group(1)
        for v in re.finditer(r'<p[^>]*>\s*([A-ZČĆĐŠŽ][\wčćđšžČĆĐŠŽ\s\-]{2,40}[A-ZČĆĐŠŽ][a-zčćđšž]+)\s*</p>', section):
            name = re.sub(r'\s+', ' ', v.group(1).strip())
            if len(name) > 4 and len(name.split()) >= 2 and 'Trenutno' not in name and name not in voditelji:
                voditelji.append(name)
        # Fallback when no <p> matched: strip tags and keep lines whose first
        # two words are capitalised. De-duplicated like the primary path.
        if not voditelji:
            for line in re.sub(r'<[^>]+>', '\n', section).split('\n'):
                line = line.strip()
                if len(line) > 4 and len(line.split()) >= 2 and 'Trenutno' not in line:
                    words = line.split()
                    if all(w[0].isupper() for w in words[:2] if w) and line not in voditelji:
                        voditelji.append(line)

    return {
        "slug": slug, "naziv": naziv, "logo": logo,
        "info": info,
        "igraci": igraci,
        "voditelji": voditelji[:10],
    }
|
||||
|
||||
# === IGRAČ PARSER ===
|
||||
def parse_igrac(h, slug):
    """Parse a player profile page into name, photo, info fields and career rows.

    Args:
        h: HTML text of the profile page (may be empty or None).
        slug: player slug; also the source of a fallback name when no usable
            ``<h3>`` is found.

    Returns:
        A dict with keys ``slug``, ``ime``, ``prezime``, ``full_name``,
        ``slika_url``, ``info`` and ``karijera``, or None when ``h`` is empty.
    """
    if not h:
        return None

    # Name: first <h3> with at least two words that is not the federation
    # banner and does not contain "Sport".
    full_name = None
    for cand in re.findall(r'<h3[^>]*>\s*([^<]+?)\s*</h3>', h):
        cand = html.unescape(cand.strip())
        if cand and 'Fédération' not in cand and 'Sport' not in cand and len(cand) < 80 and len(cand.split()) >= 2:
            full_name = cand
            break
    if not full_name:
        full_name = slug.replace("-", " ").title()
    parts = full_name.split()
    ime = parts[0] if parts else ""
    prezime = " ".join(parts[1:]) if len(parts) > 1 else ""

    # Photo: an image under /cdn/content/ carrying alt="Igrač".
    m = re.search(r'<img[^>]+src="(/cdn/content/[^"]+\.(jpg|png|jpeg))"[^>]+alt="Igrač"', h)
    slika = urljoin(BASE, m.group(1)) if m else None

    info = {}
    for m in re.finditer(r'<strong>([^<]+?):</strong>\s*([^<\n]+?)(?:<|\n)', h):
        key = m.group(1).strip().lower()
        val = re.sub(r'<[^>]+>', '', m.group(2).strip()).rstrip('.')
        val = html.unescape(val)
        if 'iskaznic' in key:
            info['iskaznica'] = val
        elif 'godina rođenja' in key:
            year = re.search(r'(\d{4})', val)
            if year:
                info['godina_rodenja'] = int(year.group(1))
        elif 'matični klub' in key:
            info['maticni_klub'] = val

    # Career ("Sportski put") - the registration table.
    karijera = []
    table_m = re.search(r'### Sportski put.*?</table>', h, re.S)
    if not table_m:
        table_m = re.search(r'Sportski put.*?</table>', h, re.S)
    if table_m:
        # Accept <tr> with attributes too; a bare '<tr>' pattern dropped them.
        rows = re.findall(r'<tr\b[^>]*>(.*?)</tr>', table_m.group(0), re.S)
        for row in rows[1:]:  # skip header
            cells = [re.sub(r'<[^>]+>', '', c).strip()
                     for c in re.findall(r'<td[^>]*>(.*?)</td>', row, re.S)]
            if len(cells) >= 4:
                karijera.append({
                    "datum_reg": cells[0].rstrip('.'),
                    "klub": cells[1],
                    "sportska_grana": cells[2],
                    "sezona": cells[3],
                    "lijecnicki": cells[4].rstrip('.') if len(cells) > 4 else None,
                })

    return {
        "slug": slug, "ime": ime, "prezime": prezime, "full_name": full_name,
        "slika_url": slika,
        "info": info,
        "karijera": karijera,
    }
|
||||
|
||||
# PGŽ clubs - the real slugs, taken from bocarski-savez-pgz
PGZ_HBS_CLUBS = [
    # Senior clubs
    *"kastav kostrena krenovac krimeja krk lovran opatija".split(),
    *"rijeka-2 srdoci sveti-jakov sveti-rok-klana vargon hreljin".split(),
    *"draga-moscenicka-draga lovranska-draga brod-moravice".split(),
    # Women's clubs
    *"zenski-bocarski-klub-cavle zenski-bocarski-klub-drenova-rijeka".split(),
    *"zenski-bocarski-klub-hreljin zenski-bocarski-klub-kastav".split(),
    *"zenska-bocarska-ekipa-kastav-2".split(),
    # Cadet / junior teams (youth categories)
    *"cavle-skola-bocanja juniorska-ekipa-cavle-sb-1 juniorska-ekipa-kastav".split(),
    *"juniorska-ekipa-lovran juniorska-ekipa-sv-rok-klana juniorska-ekipa-vargon".split(),
    *"kadetska-ekipa-bk-cavle-sb-2 kadetska-ekipa-bk-kastav-2".split(),
    *"kadetska-ekipa-bk-lovran kadetska-ekipa-bk-sveti-jakov-2".split(),
    *"kadetska-ekipa-bk-vargon kadetska-ekipa-kastav kadetska-ekipa-zbk-drenova".split(),
]
|
||||
|
||||
def upsert_klub(conn, k):
    """Insert or update one boćanje club row in pgz_sport.klubovi and return its id.

    Args:
        conn: open DB connection exposing ``cursor()`` (psycopg2 in this file).
        k: parsed club dict; reads ``k['slug']``, ``k['naziv']`` and the optional
            ``k['info']`` keys ``adresa``, ``telefon``, ``oib`` and ``liga``.

    Returns:
        The ``id`` of the matched or newly inserted club row.

    The club is looked up first by its ``hbs:<slug>`` marker in ``napomena`` and
    then by sport + case-insensitive name.
    """
    slug = k['slug']
    naziv = k['naziv']
    info = k.get('info') or {}
    adresa = info.get('adresa')

    # The markers written below are always followed by ',' (INSERT) or ']' (UPDATE).
    # Anchoring on that delimiter stops "hbs:lovran" from matching
    # "hbs:lovranska-draga". LIKE wildcards inside the slug are escaped.
    like_slug = slug.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
    markers = (f"%hbs:{like_slug},%", f"%hbs:{like_slug}]%")

    # Town = trailing one or two words of the address, without a leading
    # postal code ("51000 Rijeka" -> "Rijeka").
    grad = None
    if adresa:
        m = re.search(r'(\w+(?:\s+\w+)?)$', adresa.strip())
        if m:
            grad = re.sub(r'^\d+\s+', '', m.group(1))

    with conn.cursor() as cur:
        cur.execute(
            "SELECT id FROM pgz_sport.klubovi WHERE napomena ILIKE %s OR napomena ILIKE %s LIMIT 1",
            markers)
        row = cur.fetchone()
        if not row:
            cur.execute(
                "SELECT id FROM pgz_sport.klubovi WHERE sport='boćanje' AND lower(naziv)=lower(%s) LIMIT 1",
                (naziv,))
            row = cur.fetchone()

        if row:
            kid = row[0]
            # Append the sync marker only once; re-appending it on every run
            # made napomena grow without bound.
            cur.execute("""UPDATE pgz_sport.klubovi SET
                    adresa=COALESCE(%s, adresa),
                    telefon=COALESCE(%s, telefon),
                    grad=COALESCE(%s, grad),
                    napomena=CASE
                        WHEN napomena ILIKE %s OR napomena ILIKE %s THEN napomena
                        ELSE COALESCE(napomena,'') || ' [HBS sync ' || CURRENT_DATE || ': hbs:' || %s || ']'
                    END
                WHERE id=%s""",
                (adresa, info.get('telefon'), grad, *markers, slug, kid))
        else:
            cur.execute("""INSERT INTO pgz_sport.klubovi
                    (naziv, sport, region, grad, adresa, telefon, aktivan, napomena)
                VALUES (%s, 'boćanje', 'PGŽ', %s, %s, %s, true, %s)
                RETURNING id""",
                (naziv, grad, adresa, info.get('telefon'),
                 f"[HBS sync {dt.date.today()}: hbs:{slug}, OIB:{info.get('oib','-')}, liga:{info.get('liga','-')}]"))
            kid = cur.fetchone()[0]
    return kid
|
||||
|
||||
def upsert_igrac(conn, p, klub_db_id, klub_naziv):
    """Insert or update one player row in pgz_sport.clanovi and return its id.

    Args:
        conn: open DB connection exposing ``cursor()``.
        p: parsed player dict; reads ``ime``, ``prezime``, ``slug``, optional
            ``slika_url`` and the ``info`` keys ``iskaznica`` / ``godina_rodenja``.
        klub_db_id: id of the player's club row.
        klub_naziv: club name stored in ``klub_naziv_godisnjak``.

    Returns:
        The ``id`` of the updated or inserted member row.
    """
    info = p.get('info') or {}
    # A blank card number becomes NULL rather than '' so it cannot be mistaken
    # for a real source_id.
    iskaznica = (info.get('iskaznica') or '').strip() or None
    god = info.get('godina_rodenja')
    slug = p['slug']
    src_url = f"{BASE}/igraci/{slug}/"

    with conn.cursor() as cur:
        cid = None
        if iskaznica:
            # Primary key for matching: the HBS card number.
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND source_id=%s",
                (iskaznica,))
            row = cur.fetchone()
            if row:
                cid = row[0]
        else:
            # Without a card number fall back to the slug this function stores,
            # otherwise every re-sync would insert a duplicate row.
            cur.execute(
                "SELECT id FROM pgz_sport.clanovi WHERE source='hbs_savez' AND slug=%s ORDER BY id LIMIT 1",
                (slug,))
            row = cur.fetchone()
            if row:
                cid = row[0]

        if cid is not None:
            cur.execute("""UPDATE pgz_sport.clanovi SET
                    ime=%s, prezime=%s, sport='boćanje', uloga='igrac',
                    klub_id=%s, klub_naziv_godisnjak=%s,
                    slika_url=COALESCE(%s, slika_url),
                    godina_rodenja=COALESCE(%s, godina_rodenja),
                    slug=%s,
                    source='hbs_savez', source_id=COALESCE(%s, source_id), source_url=%s, source_synced_at=now()
                WHERE id=%s""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, slug, iskaznica, src_url, cid))
        else:
            cur.execute("""INSERT INTO pgz_sport.clanovi
                    (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak, slika_url,
                     godina_rodenja, slug, source, source_id, source_url, source_synced_at)
                VALUES (%s, %s, 'boćanje', 'igrac', %s, %s, %s, %s, %s, 'hbs_savez', %s, %s, now())
                RETURNING id""",
                (p['ime'], p['prezime'], klub_db_id, klub_naziv, p.get('slika_url'),
                 god, slug, iskaznica, src_url))
            cid = cur.fetchone()[0]
    return cid
|
||||
|
||||
def upsert_voditelj(conn, name, klub_db_id, klub_naziv, role='trener'):
    """Insert or update a team leader (stored as a coach) in pgz_sport.clanovi.

    Args:
        conn: open DB connection exposing ``cursor()``.
        name: full name; the first token is the given name, the rest the surname.
        klub_db_id: id of the club row the leader belongs to.
        klub_naziv: club name stored in ``klub_naziv_godisnjak``.
        role: value written to ``uloga`` (default ``'trener'``).

    Returns:
        The member row id, or None when ``name`` has fewer than two tokens.
    """
    parts = (name or '').strip().split()
    if len(parts) < 2:
        return None
    ime, prezime = parts[0], " ".join(parts[1:])
    src_url = f"{BASE}/klubovi/"

    with conn.cursor() as cur:
        # Several people may share a name. Prefer the one already attached to
        # this club, then the lowest id, so the choice is deterministic.
        cur.execute("""SELECT id FROM pgz_sport.clanovi
                WHERE lower(ime)=lower(%s) AND lower(prezime)=lower(%s) AND sport='boćanje'
                ORDER BY (klub_id = %s) DESC NULLS LAST, id ASC
                LIMIT 1""",
                (ime, prezime, klub_db_id))
        row = cur.fetchone()
        if row:
            # Existing member: set the role, fill club/source fields only if empty.
            cur.execute("""UPDATE pgz_sport.clanovi SET
                    uloga=%s, klub_id=COALESCE(klub_id, %s),
                    klub_naziv_godisnjak=COALESCE(klub_naziv_godisnjak, %s),
                    source_url=COALESCE(source_url, %s)
                WHERE id=%s""",
                (role, klub_db_id, klub_naziv, src_url, row[0]))
            return row[0]

        cur.execute("""INSERT INTO pgz_sport.clanovi
                (ime, prezime, sport, uloga, klub_id, klub_naziv_godisnjak,
                 source, source_url, source_synced_at)
            VALUES (%s, %s, 'boćanje', %s, %s, %s, 'hbs_savez', %s, now())
            RETURNING id""",
            (ime, prezime, role, klub_db_id, klub_naziv, src_url))
        return cur.fetchone()[0]
|
||||
|
||||
def main():
    """Scrape every club in PGZ_HBS_CLUBS and upsert the club, its leaders and players.

    For each slug the club page is fetched and parsed, the club row is upserted,
    then each team leader and each player (one profile request per player,
    spaced by DELAY seconds) is upserted. Failures for a single club, leader or
    player are logged and skipped; the connection is always closed on exit.
    """
    conn = db()
    try:
        log(f"=== HBS scraper START - {len(PGZ_HBS_CLUBS)} kandidata ===")

        success = 0
        players_total = 0
        for slug in PGZ_HBS_CLUBS:
            url = f"{BASE}/klubovi/{slug}/"
            log(f"→ KLUB {slug}")
            h = fetch(url)
            if not h:
                log(f" ✗ klub ne postoji ili 404")
                continue

            parsed = parse_klub(h, slug)
            if not parsed:
                log(f" ✗ ne mogu parse")
                continue

            # A DB error on one club must not abort the remaining clubs.
            try:
                kid = upsert_klub(conn, parsed)
            except Exception as e:
                log(f" ✗ klub {slug}: {e}")
                continue
            log(f" ✓ {parsed['naziv']} (db_id={kid}) igrača={len(parsed['igraci'])} voditelja={len(parsed['voditelji'])}")
            success += 1

            # Team leaders
            for v in parsed['voditelji']:
                try:
                    upsert_voditelj(conn, v, kid, parsed['naziv'])
                    log(f" ✓ voditelj: {v}")
                except Exception as e:
                    log(f" ✗ voditelj {v}: {e}")

            # Players - fetch the profile page of each one
            for ig in parsed['igraci']:
                time.sleep(DELAY)
                try:
                    purl = f"{BASE}/igraci/{ig['slug']}/"
                    ph = fetch(purl)
                    if not ph:
                        continue
                    pdata = parse_igrac(ph, ig['slug'])
                    if not pdata:
                        continue
                    # If the profile parser picked up the federation banner as
                    # the name, fall back to the name from the club's list.
                    if 'Fédération' in pdata.get('full_name', '') or pdata['ime'].lower() == 'fédération':
                        pdata['full_name'] = ig['ime_prezime']
                        parts = ig['ime_prezime'].split()
                        pdata['ime'] = parts[0] if parts else ''
                        pdata['prezime'] = ' '.join(parts[1:]) if len(parts) > 1 else ''
                    # Card number and birth year from the list fill profile gaps.
                    if not pdata.get('info', {}).get('iskaznica'):
                        pdata.setdefault('info', {})['iskaznica'] = ig.get('iskaznica')
                    if not pdata.get('info', {}).get('godina_rodenja'):
                        pdata.setdefault('info', {})['godina_rodenja'] = ig.get('godina_rodenja')
                    cid = upsert_igrac(conn, pdata, kid, parsed['naziv'])
                    players_total += 1
                    log(f" ✓ {pdata['ime']} {pdata['prezime']} (db={cid}, god={pdata.get('info',{}).get('godina_rodenja')})")
                except Exception as e:
                    log(f" ✗ igrač {ig['slug']}: {e}")

            time.sleep(DELAY)

        log(f"=== DONE: {success} klubova, {players_total} igrača ===")
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Executable
+173
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HKS-CBF scraper - parses Genius Sports embed standings for HR košarka leagues.
|
||||
Path: /opt/pgz-sport/scrapers/hks_scraper.py
|
||||
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
|
||||
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
|
||||
"""
|
||||
import requests, re, json, psycopg2, html as ihtml
|
||||
|
||||
# Keyword arguments for psycopg2.connect(**DB).
# The original line passed `port` twice (6432 and 5432), which Python rejects
# as a repeated keyword argument. 6432 is kept as the port paired with host
# 10.10.0.2.
# NOTE(review): confirm 6432 is the intended port for this host; the password
# is also committed in clear text - consider reading it from the environment.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')
# User-Agent sent to the Genius Sports endpoint.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Request headers used for every standings request.
HDR = {"User-Agent": UA, "Accept": "application/json", "Referer": "https://www.hks-cbf.hr/"}
|
||||
|
||||
# Competition ID lookup - 2025/26 season.
# Each entry: competition name, level label, Genius Sports competition id,
# gender code and the federation page used as source_url.
COMPS = [
    dict(natj=natj, razina=razina, comp_id=comp_id, spol=spol, url=url)
    for natj, razina, comp_id, spol, url in (
        ("Supersport Premijer Liga (M) 2025/26", "Premijer M", 42186, "M",
         "https://www.hks-cbf.hr/supersport-premijer/"),
        ("Supersport Premijer Liga (Ž) 2025/26", "Premijer Ž", 42187, "Ž",
         "https://www.hks-cbf.hr/premijer-zenska-liga/"),
        ("1.Muška liga 2025/26", "1.M liga", 42259, "M",
         "https://www.hks-cbf.hr/prva-muska-liga/"),
    )
]
|
||||
|
||||
def parse_standings(html):
    """Parse the first HTML table of a Genius Sports standings embed.

    Args:
        html: HTML fragment containing the standings ``<table>``.

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``abbrev``, ``gp``, ``bod``,
        ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and ``ner`` (always 0).
        Rows with fewer than ten cells or non-numeric values are skipped. An
        empty list is returned when there is no table or no data row.
    """
    def _num(text):
        # Drop thousands separators and an explicit plus sign; map the Unicode
        # minus (U+2212) to ASCII so negative differentials still parse.
        return int(text.replace(',', '').replace('+', '').replace('\u2212', '-'))

    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if not tables:
        return []
    rows_html = re.findall(r'<tr[^>]*>(.+?)</tr>', tables[0], re.DOTALL)
    if len(rows_html) < 2:
        return []

    standings = []
    for row in rows_html[1:]:  # skip header
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        # Columns 0..9 are read below, so ten cells are required.
        if len(cells) < 10:
            continue
        clean = [ihtml.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # e.g. ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
        try:
            poz = int(clean[0])
        except ValueError:
            continue

        # The team cell glues name and abbreviation together ("KK ZadarZAD");
        # the trailing 2-4 uppercase letters are taken as the abbreviation.
        team_raw = clean[2]
        m = re.match(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$', team_raw)
        if m:
            klub, abbrev = m.group(1).strip(), m.group(2)
        else:
            klub, abbrev = team_raw, None

        try:
            standings.append({
                "poz": poz,
                "klub": klub,
                "abbrev": abbrev,
                "gp": _num(clean[3]),
                "bod": _num(clean[4]),
                "pob": _num(clean[5]),
                "por": _num(clean[6]),
                "for_pts": _num(clean[7]),
                "ag_pts": _num(clean[8]),
                "gd": _num(clean[9]),
                "ner": 0,  # basketball has no draws
            })
        except ValueError:
            continue
    return standings
|
||||
|
||||
def main():
    """Scrape every competition in COMPS and refresh pgz_sport.natjecanja_tablice.

    For each competition the standings embed is fetched and parsed, the
    competition row is looked up or created, the previous 'hks_genius' rows are
    replaced with the fresh ones (each matched to a club row where possible),
    then summary statistics are printed and an audit_feed entry is written.
    The connection runs in autocommit mode and is always closed on exit.
    """
    # Substrings of klubovi.grad that mark a club as PGŽ even when its region
    # column does not say so.
    pgz_grad_hints = ['rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab',
                      'lošinj', 'losinj', 'vrbnik', 'novi vinodolski', 'čavle', 'cavle',
                      'kraljevica', 'kostrena', 'kastav']

    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()

        total_inserted = 0
        pgz_klubovi_seen = set()

        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                # Treat HTTP errors as fetch failures instead of parsing an error body.
                resp.raise_for_status()
                table_html = resp.json().get('html') or ''
            except Exception as e:
                print(f" ERR fetch: {e}")
                continue

            rows = parse_standings(table_html)
            print(f" Parsed {len(rows)} klubova")

            # Get/create the competition row
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja
                        (naziv, sport, razina, sezona, source, source_url)
                    VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                    RETURNING id""",
                    (comp['natj'], comp['razina'], comp['url']))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            # With autocommit on, deleting before an empty parse would wipe the
            # stored table. Keep the old rows when nothing was parsed.
            if not rows:
                print(" SKIP: 0 rows parsed, keeping existing table")
                continue

            # Clear old rows
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))

            for row in rows:
                # Match the club: exact or substring name match, preferring
                # active, basketball, PGŽ rows, then the lowest id.
                klub_id = None
                cu.execute("""SELECT id, region, grad FROM pgz_sport.klubovi
                        WHERE LOWER(naziv) = LOWER(%s)
                           OR LOWER(naziv) LIKE LOWER(%s)
                        ORDER BY
                            CASE WHEN aktivan THEN 0 ELSE 1 END,
                            CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                            CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                            id ASC LIMIT 1""",
                        (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id, region, grad = kr
                    # PGŽ check: by region, or by a known town in grad
                    grad_l = (grad or '').lower()
                    if region == 'PGŽ' or any(g in grad_l for g in pgz_grad_hints):
                        pgz_klubovi_seen.add(row['klub'])

                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                        (natjecanje_id, klub_id, klub_naziv, pozicija,
                         odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                         gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                            'hks_genius', %s, now(), %s::jsonb)""",
                    (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                     row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                     comp['url'],
                     json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        # Stats per league
        cu.execute("""SELECT n.naziv, count(t.*),
                count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
            FROM pgz_sport.natjecanja n
            LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
            WHERE n.source='hks_genius'
            GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HKS lige stats ===")
        for stat in cu.fetchall():
            print(f" {stat[1]:>3} klubova ({stat[2]} matched) {stat[0]}")

        # PGŽ clubs
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
            FROM pgz_sport.natjecanja_tablice t
            JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
            LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
            WHERE t.source='hks_genius' AND k.region='PGŽ'
            ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HKS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} -> {rec[5]} '{rec[6]}'")

        # sorted() keeps the audit payload stable between runs.
        cu.execute("""INSERT INTO pgz_sport.audit_feed
                (table_name, action, source, source_url, details)
            VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "comps": len(COMPS), "pgz_seen": sorted(pgz_klubovi_seen)}),))
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
+173
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HKS-CBF scraper - parses Genius Sports embed standings for HR košarka leagues.
|
||||
Path: /opt/pgz-sport/scrapers/hks_scraper.py
|
||||
Source: https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
|
||||
Output: pgz_sport.natjecanja_tablice + pgz_sport.klubovi (matching)
|
||||
"""
|
||||
import requests, re, json, psycopg2, html as ihtml
|
||||
|
||||
# Keyword arguments for psycopg2.connect(**DB).
# NOTE(review): the password is committed in clear text - consider reading it
# from the environment instead.
DB = {
    "host": "localhost",
    "port": 5432,
    "dbname": "rinet_v3",
    "user": "rinet",
    "password": "R1net2026!SecureDB#v7",
}
# User-Agent sent to the Genius Sports endpoint.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Request headers used for every standings request.
HDR = {
    "User-Agent": UA,
    "Accept": "application/json",
    "Referer": "https://www.hks-cbf.hr/",
}
|
||||
|
||||
# Competition ID lookup - 2025/26 season.
# Each entry: competition name, level label, Genius Sports competition id,
# gender code and the federation page used as source_url.
COMPS = [
    dict(natj=natj, razina=razina, comp_id=comp_id, spol=spol, url=url)
    for natj, razina, comp_id, spol, url in (
        ("Supersport Premijer Liga (M) 2025/26", "Premijer M", 42186, "M",
         "https://www.hks-cbf.hr/supersport-premijer/"),
        ("Supersport Premijer Liga (Ž) 2025/26", "Premijer Ž", 42187, "Ž",
         "https://www.hks-cbf.hr/premijer-zenska-liga/"),
        ("1.Muška liga 2025/26", "1.M liga", 42259, "M",
         "https://www.hks-cbf.hr/prva-muska-liga/"),
    )
]
|
||||
|
||||
def parse_standings(html):
    """Parse the first HTML table of a Genius Sports standings embed.

    Args:
        html: HTML fragment containing the standings ``<table>``.

    Returns:
        A list of dicts with keys ``poz``, ``klub``, ``abbrev``, ``gp``, ``bod``,
        ``pob``, ``por``, ``for_pts``, ``ag_pts``, ``gd`` and ``ner`` (always 0).
        Rows with fewer than ten cells or non-numeric values are skipped. An
        empty list is returned when there is no table or no data row.
    """
    def _num(text):
        # Drop thousands separators and an explicit plus sign; map the Unicode
        # minus (U+2212) to ASCII so negative differentials still parse.
        return int(text.replace(',', '').replace('+', '').replace('\u2212', '-'))

    tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if not tables:
        return []
    rows_html = re.findall(r'<tr[^>]*>(.+?)</tr>', tables[0], re.DOTALL)
    if len(rows_html) < 2:
        return []

    standings = []
    for row in rows_html[1:]:  # skip header
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        # Columns 0..9 are read below, so ten cells are required.
        if len(cells) < 10:
            continue
        clean = [ihtml.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # e.g. ['1', '', 'KK ZadarZAD', '31', '58', '27', '4', '2669', '2209', '460']
        try:
            poz = int(clean[0])
        except ValueError:
            continue

        # The team cell glues name and abbreviation together ("KK ZadarZAD");
        # the trailing 2-4 uppercase letters are taken as the abbreviation.
        team_raw = clean[2]
        m = re.match(r'^(.+?)([A-ZČĆŠŽĐ]{2,4})$', team_raw)
        if m:
            klub, abbrev = m.group(1).strip(), m.group(2)
        else:
            klub, abbrev = team_raw, None

        try:
            standings.append({
                "poz": poz,
                "klub": klub,
                "abbrev": abbrev,
                "gp": _num(clean[3]),
                "bod": _num(clean[4]),
                "pob": _num(clean[5]),
                "por": _num(clean[6]),
                "for_pts": _num(clean[7]),
                "ag_pts": _num(clean[8]),
                "gd": _num(clean[9]),
                "ner": 0,  # basketball has no draws
            })
        except ValueError:
            continue
    return standings
|
||||
|
||||
def main():
    """Scrape HKS (Genius Sports) standings for every entry in COMPS into the DB.

    For each competition: fetch the standings embed, parse it, look up or
    create the ``pgz_sport.natjecanja`` row, replace that competition's
    ``hks_genius`` rows in ``pgz_sport.natjecanja_tablice``, then print
    summary reports and write one ``pgz_sport.audit_feed`` entry.

    A competition whose fetch fails or whose table parses to zero rows is
    skipped without touching the rows already stored for it.
    """
    # Lower-cased town-name fragments that mark a club as belonging to PGŽ.
    pgz_towns = ('rijeka', 'crikv', 'opatija', 'delnice', 'krk', 'cres', 'rab',
                 'lošinj', 'losinj', 'vrbnik', 'novi vinodolski', 'čavle',
                 'cavle', 'kraljevica', 'kostrena', 'kastav')

    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()

        total_inserted = 0
        pgz_klubovi_seen = set()

        for comp in COMPS:
            print(f"\n=== {comp['natj']} (comp_id={comp['comp_id']}) ===")
            api_url = f"https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{comp['comp_id']}/standings?&iurl=https%3A%2F%2Fwww.hks-cbf.hr"
            try:
                resp = requests.get(api_url, headers=HDR, timeout=20)
                resp.raise_for_status()
                html = resp.json().get('html') or ''
            except Exception as e:
                print(f" ERR fetch: {e}")
                continue

            rows = parse_standings(html)
            print(f" Parsed {len(rows)} klubova")

            # Get/create natjecanje
            cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (comp['natj'],))
            nr = cu.fetchone()
            if nr:
                natj_id = nr[0]
            else:
                cu.execute("""INSERT INTO pgz_sport.natjecanja
                    (naziv, sport, razina, sezona, source, source_url)
                    VALUES (%s, 'košarka', %s, '2025/26', 'hks_genius', %s)
                    RETURNING id""",
                    (comp['natj'], comp['razina'], comp['url']))
                natj_id = cu.fetchone()[0]
                print(f" Created natjecanje id={natj_id}")

            if not rows:
                # Nothing parsed (layout change, empty reply): keep the
                # previously stored table instead of wiping it.
                print(" No rows parsed - keeping existing rows")
                continue

            # Clear old rows
            cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hks_genius'", (natj_id,))

            for row in rows:
                # Match klub: exact or substring name match, preferring
                # active, basketball, PGŽ clubs, then the lowest id.
                klub_id = None
                cu.execute("""SELECT id, region, grad FROM pgz_sport.klubovi
                    WHERE LOWER(naziv) = LOWER(%s)
                       OR LOWER(naziv) LIKE LOWER(%s)
                    ORDER BY
                        CASE WHEN aktivan THEN 0 ELSE 1 END,
                        CASE WHEN sport='košarka' THEN 0 ELSE 1 END,
                        CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                        id ASC LIMIT 1""",
                    (row['klub'], f"%{row['klub']}%"))
                kr = cu.fetchone()
                if kr:
                    klub_id, region, grad = kr
                    # Check if PGŽ: by region flag or by town name.
                    grad_lc = (grad or '').lower()
                    if region == 'PGŽ' or any(g in grad_lc for g in pgz_towns):
                        pgz_klubovi_seen.add(row['klub'])

                cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                    (natjecanje_id, klub_id, klub_naziv, pozicija,
                     odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                     gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                    VALUES (%s, %s, %s, %s, %s, %s, 0, %s, %s, %s, %s, %s,
                            'hks_genius', %s, now(), %s::jsonb)""",
                    (natj_id, klub_id, row['klub'], row['poz'], row['gp'], row['pob'],
                     row['por'], row['for_pts'], row['ag_pts'], row['gd'], row['bod'],
                     comp['url'],
                     json.dumps({"abbrev": row['abbrev'], "spol": comp['spol']})))
                total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted} rows inserted ===")
        print(f"PGŽ klubovi seen: {len(pgz_klubovi_seen)} - {pgz_klubovi_seen}")

        # Stats
        cu.execute("""SELECT n.naziv, count(t.*),
                   count(*) FILTER (WHERE t.klub_id IS NOT NULL) AS matched
            FROM pgz_sport.natjecanja n
            LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hks_genius'
            WHERE n.source='hks_genius'
            GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HKS lige stats ===")
        for stat in cu.fetchall():
            print(f" {stat[1]:>3} klubova ({stat[2]} matched) {stat[0]}")

        # PGŽ clubs
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.naziv AS db_naziv, k.aktivan
            FROM pgz_sport.natjecanja_tablice t
            JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
            LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
            WHERE t.source='hks_genius' AND k.region='PGŽ'
            ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HKS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b GR{rec[4]:+} -> {rec[5]} '{rec[6]}'")

        cu.execute("""INSERT INTO pgz_sport.audit_feed
            (table_name, action, source, source_url, details)
            VALUES ('natjecanja_tablice', 'hks_genius_scrape', 'hks_genius', NULL, %s::jsonb)""",
            (json.dumps({"inserted": total_inserted, "comps": len(COMPS),
                         "pgz_seen": sorted(pgz_klubovi_seen)}),))
    finally:
        # Release the connection even when a query or the scrape raises.
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Executable
+102
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
|
||||
import requests, re, html as h_unesc, psycopg2
|
||||
|
||||
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
|
||||
user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "RiNET-Civic/1.0"
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA})
|
||||
|
||||
def fetch(url):
    """Return the body of *url* as text, or None on any failure.

    Uses the module-level session ``s`` with a 15 s timeout. Network errors
    and every non-200 status yield None; interrupts such as Ctrl-C are no
    longer swallowed.
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        return None
    return r.text if r.status_code == 200 else None
|
||||
|
||||
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row mentions *header_marker*.

    Each ``<table>`` in *html_text* is split into the inner HTML of its
    ``<tr>`` elements; the first row's cells are stripped of markup and
    checked for the marker as a substring. Returns that table's row list,
    or None when no table qualifies.
    """
    for table_match in re.finditer(r'<table[^>]*>.*?</table>', html_text, re.DOTALL):
        table_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_match.group(0), re.DOTALL)
        if table_rows:
            header_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', table_rows[0], re.DOTALL)
            for cell in header_cells:
                if header_marker in re.sub(r'<[^>]+>', '', cell).strip():
                    return table_rows
    return None
|
||||
|
||||
def parse_standings_rows(rows):
    """Turn table rows (inner ``<tr>`` HTML, header first) into standings dicts.

    Args:
        rows: list of row HTML strings; ``rows[0]`` is the header and is skipped.

    Returns:
        list[dict] with keys ``poz``, ``klub``, ``odigrano``, ``pobjede``,
        ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and ``bod``. Rows with
        fewer than ten cells or non-numeric values are skipped.
    """
    parsed = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        # Strip markup first and decode entities afterwards, so escaped text
        # such as "&lt;U19&gt;" is kept instead of being removed as a tag.
        cleaned = [h_unesc.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # Indexes 0..9 are read below.
        if len(cleaned) < 10:
            continue
        try:
            poz = int(cleaned[0].rstrip('.'))
            odigrano, pobjede, nerij, porazi, gz, gp = (int(v) for v in cleaned[2:8])
            # Accept "+5", "-5" and the typographic minus "−5".
            razl = int(cleaned[8].replace('+', '').replace('−', '-'))
            bod = int(cleaned[9])
        except ValueError:
            continue
        parsed.append({"poz": poz, "klub": cleaned[1], "odigrano": odigrano,
                       "pobjede": pobjede, "nerij": nerij, "porazi": porazi,
                       "gz": gz, "gp": gp, "razl": razl, "bod": bod})
    return parsed
|
||||
|
||||
# === Sources ===
|
||||
LIGE = [
|
||||
("https://hnl.hr/", "SuperSport HNL", "1.HNL", "supersport_hnl_2025_2026", "2025/2026"),
|
||||
]
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cr = conn.cursor()
|
||||
|
||||
def find_klub(naziv):
    """Look up a club id in ``pgz_sport.klubovi`` by name; None when not found.

    Football clubs are tried first, with the name prefixed by "HNK ", then
    "NK ", then bare, then as a substring. The last attempt is a substring
    match across all sports. Club id 4426 is excluded from every query.
    """
    football_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1"
    any_sport_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"
    wildcard = f"%{naziv}%"

    attempts = [(football_sql, pattern)
                for pattern in (f"HNK {naziv}", f"NK {naziv}", naziv, wildcard)]
    attempts.append((any_sport_sql, wildcard))

    for sql, pattern in attempts:
        cr.execute(sql, (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
|
||||
|
||||
# Scrape every league in LIGE and replace its stored table. A league is left
# untouched when the page cannot be fetched, no standings table is found,
# nothing parses, or its natjecanje row does not exist.
try:
    for url, naziv, razina, ext_id, sezona in LIGE:
        print(f"=== {naziv} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = find_table_with_header(body, "Klub")
        if not rows:
            print(" no table found")
            continue
        parsed = parse_standings_rows(rows)
        print(f" {len(parsed)} rows parsed")
        if not parsed:
            # Do not wipe the stored table when the parse produced nothing.
            print(" nothing parsed - keeping existing rows")
            continue

        cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
        natj = cr.fetchone()
        if natj is None:
            print(f" natjecanje external_id={ext_id} not found")
            continue
        nid = natj[0]
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))

        matched = 0
        for r in parsed:
            kid = find_klub(r["klub"])
            if kid:
                matched += 1
            cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                 porazi, gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                 r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
        print(f" matched klub_id: {matched}/{len(parsed)}")

    # Verify Rijeka
    cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
        WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
    for r in cr.fetchall():
        print(f" {r}")
finally:
    # Release the connection even when a query raises.
    conn.close()
|
||||
+102
@@ -0,0 +1,102 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.6 — Scrape HNL + 2.HNL + 3.HNL into natjecanja_tablice."""
|
||||
import requests, re, html as h_unesc, psycopg2
|
||||
|
||||
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
|
||||
user="rinet", password="R1net2026!SecureDB#v7")
|
||||
UA = "RiNET-Civic/1.0"
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA})
|
||||
|
||||
def fetch(url):
    """Return the body of *url* as text, or None on any failure.

    Uses the module-level session ``s`` with a 15 s timeout. Network errors
    and every non-200 status yield None; interrupts such as Ctrl-C are no
    longer swallowed.
    """
    try:
        r = s.get(url, timeout=15)
    except requests.RequestException:
        return None
    return r.text if r.status_code == 200 else None
|
||||
|
||||
def find_table_with_header(html_text, header_marker="Klub"):
    """Return the rows of the first table whose first row mentions *header_marker*.

    Each ``<table>`` in *html_text* is split into the inner HTML of its
    ``<tr>`` elements; the first row's cells are stripped of markup and
    checked for the marker as a substring. Returns that table's row list,
    or None when no table qualifies.
    """
    for table_match in re.finditer(r'<table[^>]*>.*?</table>', html_text, re.DOTALL):
        table_rows = re.findall(r'<tr[^>]*>(.*?)</tr>', table_match.group(0), re.DOTALL)
        if table_rows:
            header_cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', table_rows[0], re.DOTALL)
            for cell in header_cells:
                if header_marker in re.sub(r'<[^>]+>', '', cell).strip():
                    return table_rows
    return None
|
||||
|
||||
def parse_standings_rows(rows):
    """Turn table rows (inner ``<tr>`` HTML, header first) into standings dicts.

    Args:
        rows: list of row HTML strings; ``rows[0]`` is the header and is skipped.

    Returns:
        list[dict] with keys ``poz``, ``klub``, ``odigrano``, ``pobjede``,
        ``nerij``, ``porazi``, ``gz``, ``gp``, ``razl`` and ``bod``. Rows with
        fewer than ten cells or non-numeric values are skipped.
    """
    parsed = []
    for row in rows[1:]:
        cells = re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', row, re.DOTALL)
        # Strip markup first and decode entities afterwards, so escaped text
        # such as "&lt;U19&gt;" is kept instead of being removed as a tag.
        cleaned = [h_unesc.unescape(re.sub(r'<[^>]+>', '', c)).strip() for c in cells]
        # Indexes 0..9 are read below.
        if len(cleaned) < 10:
            continue
        try:
            poz = int(cleaned[0].rstrip('.'))
            odigrano, pobjede, nerij, porazi, gz, gp = (int(v) for v in cleaned[2:8])
            # Accept "+5", "-5" and the typographic minus "−5".
            razl = int(cleaned[8].replace('+', '').replace('−', '-'))
            bod = int(cleaned[9])
        except ValueError:
            continue
        parsed.append({"poz": poz, "klub": cleaned[1], "odigrano": odigrano,
                       "pobjede": pobjede, "nerij": nerij, "porazi": porazi,
                       "gz": gz, "gp": gp, "razl": razl, "bod": bod})
    return parsed
|
||||
|
||||
# === Sources ===
|
||||
LIGE = [
|
||||
("https://hnl.hr/", "SuperSport HNL", "1.HNL", "supersport_hnl_2025_2026", "2025/2026"),
|
||||
]
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cr = conn.cursor()
|
||||
|
||||
def find_klub(naziv):
    """Look up a club id in ``pgz_sport.klubovi`` by name; None when not found.

    Football clubs are tried first, with the name prefixed by "HNK ", then
    "NK ", then bare, then as a substring. The last attempt is a substring
    match across all sports. Club id 4426 is excluded from every query.
    """
    football_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 AND LOWER(sport) LIKE '%%nogomet%%' LIMIT 1"
    any_sport_sql = "SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) ILIKE LOWER(%s) AND id != 4426 LIMIT 1"
    wildcard = f"%{naziv}%"

    attempts = [(football_sql, pattern)
                for pattern in (f"HNK {naziv}", f"NK {naziv}", naziv, wildcard)]
    attempts.append((any_sport_sql, wildcard))

    for sql, pattern in attempts:
        cr.execute(sql, (pattern,))
        hit = cr.fetchone()
        if hit:
            return hit[0]
    return None
|
||||
|
||||
# Scrape every league in LIGE and replace its stored table. A league is left
# untouched when the page cannot be fetched, no standings table is found,
# nothing parses, or its natjecanje row does not exist.
try:
    for url, naziv, razina, ext_id, sezona in LIGE:
        print(f"=== {naziv} ===")
        body = fetch(url)
        if not body:
            print(" fetch failed")
            continue
        rows = find_table_with_header(body, "Klub")
        if not rows:
            print(" no table found")
            continue
        parsed = parse_standings_rows(rows)
        print(f" {len(parsed)} rows parsed")
        if not parsed:
            # Do not wipe the stored table when the parse produced nothing.
            print(" nothing parsed - keeping existing rows")
            continue

        cr.execute("SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual' AND external_id=%s", (ext_id,))
        natj = cr.fetchone()
        if natj is None:
            print(f" natjecanje external_id={ext_id} not found")
            continue
        nid = natj[0]
        cr.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s", (nid,))

        matched = 0
        for r in parsed:
            kid = find_klub(r["klub"])
            if kid:
                matched += 1
            cr.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede, nerijeseno,
                 porazi, gol_z, gol_p, gol_razlika, bodovi)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (nid, kid, r["klub"][:200], r["poz"], r["odigrano"], r["pobjede"],
                 r["nerij"], r["porazi"], r["gz"], r["gp"], r["razl"], r["bod"]))
        print(f" matched klub_id: {matched}/{len(parsed)}")

    # Verify Rijeka
    cr.execute("""SELECT pozicija, klub_naziv, bodovi FROM pgz_sport.natjecanja_tablice
        WHERE klub_naziv ILIKE '%Rijeka%' AND natjecanje_id IN (SELECT id FROM pgz_sport.natjecanja WHERE source='hns_manual')""")
    for r in cr.fetchall():
        print(f" {r}")
finally:
    # Release the connection even when a query raises.
    conn.close()
|
||||
Executable
+173
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HNS Semafor ligaški scraper - parses body.innerText (SPA, no <table> tags)
|
||||
Path: /opt/pgz-sport/scrapers/hns_lige_standings.py
|
||||
Author: Damir Radulić / Ri.NET
|
||||
Source: https://semafor.hns.family/natjecanja/...
|
||||
Output: pgz_sport.natjecanja_tablice (source=hns_semafor)
|
||||
Run: python3 hns_lige_standings.py
|
||||
"""
|
||||
import asyncio

import psycopg2
from playwright.async_api import async_playwright
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
LIGE = [
|
||||
{"natj": "Supersport HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100391485/supersport-hnl", "razina": "1.HNL"},
|
||||
{"natj": "2.HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100413651/2-hnl", "razina": "2.HNL"},
|
||||
{"natj": "3.HNL 2025/26", "url": "https://semafor.hns.family/natjecanja/100418001/3-hnl", "razina": "3.HNL"},
|
||||
{"natj": "Magenta 1.HNL Juniori 2025/26", "url": "https://semafor.hns.family/natjecanja/100511683/magenta-1-hnl-juniori", "razina": "Juniori"},
|
||||
{"natj": "1.HNKŽ 2025/26", "url": "https://semafor.hns.family/natjecanja/100914995/1-hnk%C5%BE", "razina": "Žene 1.razred"},
|
||||
]
|
||||
|
||||
async def scrape_one(page, liga):
    """Load one HNS Semafor league page and return its standings rows.

    Args:
        page: Playwright page used for navigation.
        liga: dict with at least ``natj`` (display name) and ``url``.

    Returns:
        list[dict] with keys ``poz``, ``klub``, ``uk``, ``pob``, ``ner``,
        ``por``, ``gp``, ``gpr``, ``gr`` and ``bod``. Any failure (navigation
        error, no standings header on the page) is printed and yields ``[]``.
    """
    print(f"\n=== {liga['natj']} ===", flush=True)
    try:
        await page.goto(liga['url'], wait_until="domcontentloaded", timeout=30000)
        await page.wait_for_timeout(2000)
        try:
            btn = await page.query_selector(".cky-btn-accept")
            if btn:
                await btn.click()
                await page.wait_for_timeout(1500)
        except Exception:
            # The cookie banner is optional; failing to dismiss it is fine.
            # (Not a bare except, so task cancellation still propagates.)
            pass
        # Give the SPA time to render the table.
        await page.wait_for_timeout(4000)

        body_text = await page.evaluate("() => document.body.innerText")
        rows = _parse_ljestvica(body_text)
        if rows is None:
            print(" No Ljestvica found")
            return []

        print(f" Parsed {len(rows)} klubova")
        for r in rows[:3]:
            print(f" {r['poz']:>2}. {r['klub']:<25} {r['bod']:>3} bod, GR={r['gr']:+}")
        return rows
    except Exception as e:
        print(f" ERR: {e}")
        return []


def _parse_ljestvica(body_text):
    """Parse standings rows out of a page's innerText; None if no 'Bod' header."""
    # The table body starts right after the last column header, "Bod".
    idx = body_text.find('\nBod\n')
    if idx < 0:
        idx = body_text.find('\nBod ')
    if idx < 0:
        return None

    # Single-letter form indicators and the "Forma" label are not table data.
    form_tokens = {'P', 'N', 'X', 'V', 'D', 'W', 'L', 'F', 'Forma'}
    # Both header variants are five characters long; read at most 8000 more.
    lines = [ln.strip() for ln in body_text[idx + 5:idx + 5 + 8000].split('\n')]
    lines = [ln for ln in lines if ln and ln not in form_tokens]

    rows = []
    i = 0
    while i < len(lines):
        try:
            # A row is ten consecutive lines:
            # position, club, played, won, drawn, lost, GF, GA, GD, points.
            poz = int(lines[i])
            klub = lines[i + 1]
            if not 1 <= poz <= 50 or klub.isdigit() or len(klub) < 3:
                i += 1
                continue
            uk, pob, ner, por, gp, gpr = (int(v) for v in lines[i + 2:i + 8])
            # Accept "+5", "-5" and the typographic minus "−5".
            gr = int(lines[i + 8].strip().replace('+', '').replace('−', '-'))
            bod = int(lines[i + 9])
        except (ValueError, IndexError):
            # Not the start of a row; resynchronise one line further on.
            i += 1
            continue
        rows.append({
            "poz": poz, "klub": klub, "uk": uk, "pob": pob, "ner": ner, "por": por,
            "gp": gp, "gpr": gpr, "gr": gr, "bod": bod
        })
        i += 10
    return rows
|
||||
|
||||
async def run():
    """Scrape every league in LIGE and store its table as ``hns_semafor`` rows.

    For each league with parsed rows: look up or create the
    ``pgz_sport.natjecanja`` row, replace that competition's ``hns_semafor``
    rows in ``pgz_sport.natjecanja_tablice``, then write one
    ``pgz_sport.audit_feed`` entry and print summary reports. Leagues that
    yield no rows are skipped and their stored rows kept. The browser and the
    DB connection are released even when a step raises.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        all_inserted = 0

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                ctx = await browser.new_context(user_agent="Mozilla/5.0 Chrome/120 Safari/537.36")
                page = await ctx.new_page()

                for liga in LIGE:
                    rows = await scrape_one(page, liga)
                    if not rows:
                        continue

                    cu.execute("""SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1""", (liga['natj'],))
                    nr = cu.fetchone()
                    if nr:
                        natj_id = nr[0]
                    else:
                        cu.execute("""INSERT INTO pgz_sport.natjecanja
                            (naziv, sport, razina, sezona, source, source_url)
                            VALUES (%s, 'nogomet', %s, '2025/26', 'hns_semafor', %s)
                            RETURNING id""",
                            (liga['natj'], liga['razina'], liga['url']))
                        natj_id = cu.fetchone()[0]
                        print(f" Created natjecanje id={natj_id}")

                    # Clear old rows for this natjecanje (no sezona col)
                    cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hns_semafor'",
                               (natj_id,))

                    for r in rows:
                        # Exact or substring name match, preferring active
                        # football clubs, then the lowest id.
                        cu.execute("""SELECT id FROM pgz_sport.klubovi
                            WHERE LOWER(naziv) = LOWER(%s)
                               OR LOWER(naziv) LIKE LOWER(%s)
                            ORDER BY
                                CASE WHEN aktivan THEN 0 ELSE 1 END,
                                CASE WHEN sport='nogomet' THEN 0 ELSE 1 END,
                                id ASC LIMIT 1""",
                            (r['klub'], f"%{r['klub']}%"))
                        kr = cu.fetchone()
                        klub_id = kr[0] if kr else None

                        cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                            (natjecanje_id, klub_id, klub_naziv, pozicija,
                             odigrano, pobjede, nerijeseno, porazi, gol_z, gol_p,
                             gol_razlika, bodovi, source, source_url, updated_at)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
                                    'hns_semafor', %s, now())""",
                            (natj_id, klub_id, r['klub'], r['poz'], r['uk'], r['pob'], r['ner'],
                             r['por'], r['gp'], r['gpr'], r['gr'], r['bod'], liga['url']))
                        all_inserted += 1
            finally:
                await browser.close()

        print(f"\n=== TOTAL inserted: {all_inserted} rows ===")

        cu.execute("""INSERT INTO pgz_sport.audit_feed
            (table_name, action, source, source_url, details)
            VALUES ('natjecanja_tablice', 'hns_lige_scrape', 'hns_semafor', NULL, %s::jsonb)""",
            (f'{{"inserted":{all_inserted},"lige":{len(LIGE)}}}',))

        cu.execute("""SELECT n.naziv, count(t.*) FROM pgz_sport.natjecanja n
            LEFT JOIN pgz_sport.natjecanja_tablice t
                   ON n.id=t.natjecanje_id AND t.source='hns_semafor'
            WHERE n.source='hns_semafor' AND n.sezona='2025/26'
            GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HNS lige stats ===")
        for r in cu.fetchall():
            print(f" {r[1]:>3} klubova - {r[0]}")

        # PGŽ clubs in the tables, recognised by name fragments.
        cu.execute("""SELECT n.naziv AS liga, t.pozicija, t.klub_naziv, t.bodovi, t.gol_razlika, k.id, k.aktivan
            FROM pgz_sport.natjecanja_tablice t
            JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
            LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
            WHERE t.source='hns_semafor'
              AND (lower(t.klub_naziv) LIKE '%%rijeka%%' OR lower(t.klub_naziv) LIKE '%%opatija%%'
                OR lower(t.klub_naziv) LIKE '%%krk%%' OR lower(t.klub_naziv) LIKE '%%delnice%%'
                OR lower(t.klub_naziv) LIKE '%%zamet%%' OR lower(t.klub_naziv) LIKE '%%orijent%%'
                OR lower(t.klub_naziv) LIKE '%%cresnik%%' OR lower(t.klub_naziv) LIKE '%%goranin%%'
                OR lower(t.klub_naziv) LIKE '%%kvarner%%')
            ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HNS ligama 2025/26 ===")
        for r in cu.fetchall():
            match = f"klub_id={r[5]}" if r[5] else "❌ no match"
            print(f" {r[0][:30]:<30} {r[1]:>2}. {r[2]:<25} {r[3]:>3}b GR{r[4]:+} {match}")
    finally:
        conn.close()
|
||||
|
||||
# Start the scrape only when executed as a script, so importing the module
# (for example to reuse scrape_one) has no side effects.
if __name__ == "__main__":
    asyncio.run(run())
|
||||
Executable
+608
@@ -0,0 +1,608 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HNS Semafor scraper for PGŽ football clubs.
|
||||
|
||||
Strategy:
|
||||
1. Seed-map known PGŽ clubs to HNS COMET klub_id (manual list to start)
|
||||
2. For each klub: fetch /klubovi/{id}/{slug}/ and extract roster (player list)
|
||||
3. For each player: fetch /igraci/{id}/{slug}/ → store in clanovi + utakmice_log
|
||||
4. Respect rate limit (1 req / 1.5s), record run in scraper_runs
|
||||
|
||||
Run modes:
|
||||
python hns_semafor.py seed # set hns_klub_id for known clubs
|
||||
python hns_semafor.py klub <db_klub_id> # scrape one klub roster + players
|
||||
python hns_semafor.py player <hns_pid> # scrape one player
|
||||
python hns_semafor.py daily # full daily harvest of seeded PGŽ clubs
|
||||
"""
|
||||
import os, re, sys, time, json, logging
|
||||
from datetime import datetime, date
|
||||
from urllib.parse import urljoin
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
BASE = "https://semafor.hns.family"
|
||||
UA = "PGZ-Sport-Bot/1.0 (+https://api.rinet.one/sport/; legitimni interes; analitika sporta PGZ)"
|
||||
RATE_S = 1.6 # seconds between requests
|
||||
TIMEOUT = 25
|
||||
|
||||
log = logging.getLogger("hns")
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
level=logging.INFO,
|
||||
handlers=[
|
||||
logging.FileHandler("/opt/pgz-sport/_logs/hns_scraper.log"),
|
||||
logging.StreamHandler(sys.stdout),
|
||||
],
|
||||
)
|
||||
|
||||
# ═══ Manual seed mapping — PGŽ klubovi → HNS COMET id ═══
|
||||
# Discovered from semafor.hns.family/igraci/1167145/marko-komadina/ matches
|
||||
SEED_MAP = {
|
||||
# naziv → hns_klub_id
|
||||
"NK Klana": 1569,
|
||||
"NK Krk": 1558,
|
||||
"NK Mune": 1576,
|
||||
"NK Vihor": 4326,
|
||||
"NK Doker": 107415,
|
||||
"HNK Kozala": 3090,
|
||||
"HNK Lovran": 1574,
|
||||
"HNK Goranin": 1565,
|
||||
"NK Risnjak": 1583,
|
||||
"NK Lokomotiva": 1570,
|
||||
"NK Omladinac Vrata": 1579,
|
||||
"NK Draga": 1554,
|
||||
"NK Zamet": 1589,
|
||||
"NK Vrbovsko": 1588,
|
||||
"NK Rikard Benčić": 1582,
|
||||
"NK OŠK Omišalj": 3071,
|
||||
}
|
||||
|
||||
def conn():
    """Open and return a new psycopg2 connection built from the module-level DB settings."""
    connection = psycopg2.connect(**DB)
    return connection
|
||||
|
||||
def fetch(url: str) -> str:
    """GET *url* and return the response body as text.

    Raises:
        requests.RequestException: on network failure or a 4xx/5xx status.

    The rate-limit pause of RATE_S seconds is taken after every request,
    including failed ones, so a caller that retries or moves on after an
    error does not hit the server back-to-back.
    """
    log.info(f"GET {url}")
    try:
        r = requests.get(url, headers={"User-Agent": UA}, timeout=TIMEOUT)
        r.raise_for_status()
        return r.text
    finally:
        time.sleep(RATE_S)
|
||||
|
||||
def slugify(s: str) -> str:
    """Return a lowercase, hyphen-separated ASCII slug of *s*.

    Croatian diacritics (č, ć, š, ž, đ) are folded to their base letters;
    every other run of characters outside ``a-z0-9`` becomes one hyphen, and
    leading/trailing hyphens are dropped.
    """
    folded = s.lower().strip().translate(str.maketrans("čćšžđ", "ccszd"))
    return re.sub(r'[^a-z0-9]+', '-', folded).strip('-')
|
||||
|
||||
def cmd_seed():
    """Apply SEED_MAP to ``pgz_sport.klubovi``.

    For each (name, HNS id) pair: set ``hns_klub_id``/``hns_slug`` on football
    clubs whose name contains the seed name and that have no HNS id yet (or
    already this one). If none matched and no club carries this HNS id, a
    minimal new club row is inserted.

    Returns:
        dict: ``{"updated": <rows updated>, "inserted": <rows inserted>}``.
    """
    from contextlib import closing

    n_updated = 0
    n_inserted = 0
    # `with c` only commits or rolls back; `closing` also closes the
    # connection, which psycopg2's connection context manager does not do.
    with closing(conn()) as c, c:
        cu = c.cursor()
        for naziv, hns_id in SEED_MAP.items():
            slug = slugify(naziv)
            cu.execute("""UPDATE pgz_sport.klubovi
                SET hns_klub_id=%s, hns_slug=%s, source_synced_at=now()
                WHERE naziv ILIKE %s AND sport='nogomet'
                  AND (hns_klub_id IS NULL OR hns_klub_id=%s)""",
                (hns_id, slug, f"%{naziv}%", hns_id))
            if cu.rowcount > 0:
                n_updated += cu.rowcount
                continue
            # Try by hns_klub_id directly (already set elsewhere)
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s", (hns_id,))
            if cu.fetchone():
                continue
            # Insert new minimal row
            cu.execute("""INSERT INTO pgz_sport.klubovi
                (naziv, sport, razina, hns_klub_id, hns_slug, aktivan, region,
                 source_synced_at, napomena)
                VALUES (%s,'nogomet','3.HRL',%s,%s,true,'PGŽ',now(),
                        'Auto-seeded from HNS Semafor (legitimni interes — analitika)')""",
                (naziv, hns_id, slug))
            n_inserted += 1
    log.info(f"Seed: updated={n_updated}, inserted={n_inserted}")
    return {"updated": n_updated, "inserted": n_inserted}
|
||||
|
||||
def parse_player_profile(hns_pid: int, html: str) -> dict:
    """Parse an HNS Semafor ``/igraci/{id}/{slug}/`` page into a dict.

    Args:
        hns_pid: HNS player id, copied into the result.
        html: page HTML.

    Returns:
        dict that always has ``hns_pid`` and ``matches`` (empty list) and,
        when found on the page, ``ime_prezime``, ``slika_url``,
        ``trenutni_klub_hns_id``, ``trenutni_klub``, ``datum_rodenja_raw``,
        ``datum_rodenja`` (ISO date) and ``mjesto_rodenja``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {"hns_pid": hns_pid, "matches": []}

    # Name in first <h1>
    h1 = soup.find('h1')
    if h1:
        data['ime_prezime'] = h1.get_text(' ', strip=True)

    # Photo: the <img> whose alt text is the player's name. Skipped when no
    # name was found, since alt='' would match unrelated images.
    name = data.get('ime_prezime')
    if name:
        img = soup.find('img', alt=name)
        if img and img.get('src'):
            data['slika_url'] = img['src']

    # Current club: first link to /klubovi/<id>/, with its <h4> as the name.
    klub_link = soup.find('a', href=re.compile(r'/klubovi/(\d+)/'))
    if klub_link:
        m = re.search(r'/klubovi/(\d+)/', klub_link['href'])
        if m:
            data['trenutni_klub_hns_id'] = int(m.group(1))
        h = klub_link.find('h4')
        if h:
            data['trenutni_klub'] = h.get_text(' ', strip=True)

    # Date of birth: the <h4> inside <li class="dob">, formatted d.m.yyyy
    li_dob = soup.find('li', class_='dob')
    if li_dob:
        h4 = li_dob.find('h4')
        if h4:
            t = h4.get_text(' ', strip=True)
            data['datum_rodenja_raw'] = t
            m = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', t)
            if m:
                day, month, year = (int(g) for g in m.groups())
                try:
                    data['datum_rodenja'] = date(year, month, day).isoformat()
                except ValueError:
                    # Impossible calendar date; keep only the raw text.
                    pass

    # Place of birth: the <h4> inside <li class="pob">
    li_pob = soup.find('li', class_='pob')
    if li_pob:
        h4_m = li_pob.find('h4')
        if h4_m:
            data['mjesto_rodenja'] = h4_m.get_text(strip=True)

    # Older layout, used only as a fallback: the <h4> preceding an <h3>
    # labelled "Mjesto rođenja". It must not overwrite the value found above.
    if 'mjesto_rodenja' not in data:
        for h3 in soup.find_all('h3'):
            if 'Mjesto rođenja' in h3.get_text():
                prev = h3.find_previous('h4')
                if prev:
                    data['mjesto_rodenja'] = prev.get_text(strip=True)

    return data
|
||||
|
||||
def upsert_player(klub_id_db: int, prof: dict) -> int:
    """Insert or update the ``pgz_sport.clanovi`` row for a parsed player profile.

    The row is identified by ``source='hns_semafor'`` and
    ``source_id=str(prof['hns_pid'])``.

    Args:
        klub_id_db: id of the player's club in ``pgz_sport.klubovi``.
        prof: profile dict as produced by ``parse_player_profile``.

    Returns:
        int: id of the inserted or updated ``clanovi`` row.
    """
    from contextlib import closing

    full_name = prof.get('ime_prezime', '') or ''
    # First word is the given name, everything after it the surname.
    ime, _, prezime = full_name.partition(' ')
    slug = slugify(full_name)
    source_id = str(prof['hns_pid'])
    url = f"{BASE}/igraci/{prof['hns_pid']}/{slug}/"

    # `with c` only commits or rolls back; `closing` also closes the
    # connection, which psycopg2's connection context manager does not do.
    with closing(conn()) as c, c:
        cu = c.cursor()
        # Try find existing by source_id
        cu.execute("""SELECT id FROM pgz_sport.clanovi
            WHERE source='hns_semafor' AND source_id=%s""", (source_id,))
        row = cu.fetchone()
        if row:
            cid = row[0]
            cu.execute("""UPDATE pgz_sport.clanovi
                SET ime=%s, prezime=%s, datum_rodenja=%s, mjesto_rodenja=%s,
                    slika_url=%s, klub_id=%s, source_url=%s, source_synced_at=now()
                WHERE id=%s""",
                (ime, prezime, prof.get('datum_rodenja'), prof.get('mjesto_rodenja'),
                 prof.get('slika_url'), klub_id_db, url, cid))
        else:
            cu.execute("""INSERT INTO pgz_sport.clanovi
                (klub_id, ime, prezime, datum_rodenja, mjesto_rodenja, slika_url,
                 source, source_id, source_url, source_synced_at, slug)
                VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                RETURNING id""",
                (klub_id_db, ime, prezime, prof.get('datum_rodenja'),
                 prof.get('mjesto_rodenja'), prof.get('slika_url'),
                 source_id, url, slug))
            cid = cu.fetchone()[0]
    return cid
|
||||
|
||||
def cmd_player(hns_pid: int, klub_id_db: int = None):
    """Scrape a single player by HNS id and upsert the result.

    Args:
        hns_pid: HNS player id.
        klub_id_db: club id in ``pgz_sport.klubovi``. When omitted it is taken
            from the player's stored row, or else resolved from the current
            club shown on the profile page.

    Returns:
        The ``clanovi`` id returned by ``upsert_player``, or None when the
        page carries no player name or no DB club could be determined.
    """
    from contextlib import closing

    if klub_id_db is None:
        # try to infer from current klub via DB if previously stored
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""SELECT klub_id FROM pgz_sport.clanovi
                WHERE source='hns_semafor' AND source_id=%s""", (str(hns_pid),))
            r = cu.fetchone()
            if r:
                klub_id_db = r[0]

    url = f"{BASE}/igraci/{hns_pid}/dummy/"  # slug is forgiving; HNS redirects
    html = fetch(url)
    prof = parse_player_profile(hns_pid, html)
    log.info(f"Parsed: {prof.get('ime_prezime','?')} (HNS#{hns_pid}) klub={prof.get('trenutni_klub','?')}")

    if not prof.get('ime_prezime'):
        # A page without a name is not a usable profile; upserting it would
        # overwrite a stored name with empty strings.
        log.warning(f"No player name on page for HNS player {hns_pid} — skipping upsert")
        return None

    # Resolve current_klub_hns_id → klub_id_db if not provided
    if klub_id_db is None and prof.get('trenutni_klub_hns_id'):
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id=%s", (prof['trenutni_klub_hns_id'],))
            r = cu.fetchone()
            if r:
                klub_id_db = r[0]

    if klub_id_db is None:
        log.warning(f"No DB klub_id for HNS player {hns_pid} — skipping upsert")
        return None

    return upsert_player(klub_id_db, prof)
|
||||
|
||||
def cmd_daily():
    """Run the daily harvest: re-seed clubs, then re-fetch known players.

    Opens a ``pgz_sport.scraper_runs`` row, applies the seed mapping,
    refreshes up to 500 ``hns_semafor`` players (least recently synced first)
    and always closes the run row with the final counters. Status is ``ok``
    when nothing failed and ``partial`` otherwise, including when the run
    aborts with an unexpected error, which is re-raised after being recorded.
    """
    from contextlib import closing

    run_id = None
    # `with c` only commits or rolls back; `closing` also closes the
    # connection, which psycopg2's connection context manager does not do.
    with closing(conn()) as c, c:
        cu = c.cursor()
        cu.execute("""INSERT INTO pgz_sport.scraper_runs (source, scope)
            VALUES ('hns_semafor','daily') RETURNING id""")
        run_id = cu.fetchone()[0]

    inserted = 0
    updated = 0
    errors = []
    try:
        # Phase 1: ensure seed mapping is current
        cmd_seed()
        log.info("=== Daily HNS harvest start ===")
        # TODO: roster discovery requires per-klub roster page. For now, only re-fetch known players.
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""SELECT source_id FROM pgz_sport.clanovi
                WHERE source='hns_semafor' ORDER BY source_synced_at NULLS FIRST LIMIT 500""")
            pids = [r[0] for r in cu.fetchall()]
        for pid in pids:
            try:
                # cmd_player returns None when it skipped the upsert; only
                # players actually written count as updated.
                if cmd_player(int(pid)) is not None:
                    updated += 1
            except Exception as e:
                log.error(f"player {pid}: {e}")
                errors.append({"pid": pid, "err": str(e)})
        log.info(f"=== Daily done: updated={updated} errors={len(errors)} ===")
    except Exception as e:
        # Record the abort so the run is not closed as "ok", then re-raise.
        errors.append({"pid": None, "err": f"fatal: {e}"})
        raise
    finally:
        with closing(conn()) as c, c:
            cu = c.cursor()
            cu.execute("""UPDATE pgz_sport.scraper_runs
                SET finished_at=now(), status=%s, rows_updated=%s, errors=%s::jsonb, rows_inserted=%s
                WHERE id=%s""",
                ("ok" if not errors else "partial", updated, json.dumps(errors), inserted, run_id))
|
||||
|
||||
|
||||
def parse_match(html, match_url=None):
    """HNS match parser v4 — driven by the page's CSS class signals.

    Each player is rendered as <li class='row match_lineup' data-personid='87561'>:
      <div class='shirtNumber'>9</div>
      <div class='playerPhoto'><div class='photo'><img src='...' /></div></div>
      <div class='playerName'><h3><a href='/igraci/.../'>Ivan Laginja</a></h3>Igrač</div>
      <div class='matchEvents'>
        <ul class='events'>
          <li class='goal'><div class='icon' title='Gol'></div>40'</li>
          <li class='substitutionOut'><div class='icon' title='Izmjena'></div>87'</li>
          <li class='yellow'>...</li>
          <li class='red'>...</li>
          <li class='ownGoal'>...</li>
          <li class='substitutionIn'>...</li>
        </ul>
      </div>

    Args:
        html: Raw HTML of a match page.
        match_url: Optional URL of the page; copied into the result unchanged.

    Returns:
        dict with keys:
          'teams'     -- {team name: [player dict, ...]} for the home/away
                         blocks whose club name could be read;
          'match_url' -- the ``match_url`` argument;
          'meta'      -- klub_dom, klub_gost and, when found, klub_dom_logo,
                         klub_gost_logo, datum (YYYY-MM-DD), vrijeme,
                         gledatelja, rezultat, natjecanje;
          'title'     -- text of the first <h1>, or ''.
        Each player dict has hns_pid, ime_prezime, broj_dresa, pozicija,
        slika_url, captain, starter, events, pogodaka, zuti_kartoni,
        crveni_kartoni and minute (estimated minutes played).
    """
    soup = BeautifulSoup(html, 'html.parser')
    out = {"teams": {}, "match_url": match_url, "meta": {}, "title": ""}

    h1 = soup.find('h1')
    out['title'] = h1.get_text(' ', strip=True) if h1 else ''

    EVENT_KIND_MAP = {
        'goal': 'gol',
        'ownGoal': 'autogol',
        'penaltyGoal': 'gol',
        'yellow': 'zuti',
        'secondYellow': 'zuti2',  # second yellow → effectively red
        'red': 'crveni',
        'substitutionIn': 'subIn',
        'substitutionOut': 'subOut',
    }

    def regulation_minute(label):
        """Return the regulation minute of a label like "87" or "90+2" (→ 90), else None."""
        if not label:
            return None
        m = re.match(r'\s*(\d+)', label)
        return int(m.group(1)) if m else None

    def parse_events(li):
        """Collect the recognised events ({'kind', 'minute'}) listed for one player row."""
        events = []
        me_div = li.find('div', class_='matchEvents')
        ev_ul = me_div.find('ul', class_='events') if me_div else None
        if not ev_ul:
            return events
        for ev_li in ev_ul.find_all('li', recursive=False):
            ev_cls = ev_li.get('class') or []
            kind = next((EVENT_KIND_MAP[k] for k in ev_cls if k in EVENT_KIND_MAP), None)
            if not kind:
                continue
            mm = re.search(r"(\d+(?:\+\d+)?)\s*'", ev_li.get_text(' ', strip=True))
            events.append({'kind': kind, 'minute': mm.group(1) if mm else None})
        return events

    def estimate_minutes(is_starter, events):
        """Estimate minutes played on a 90-minute scale from substitution events."""
        came_on = regulation_minute(
            next((e['minute'] for e in events if e['kind'] == 'subIn'), None))
        went_off = regulation_minute(
            next((e['minute'] for e in events if e['kind'] == 'subOut'), None))
        if is_starter:
            # Started: played until substituted, otherwise the full game.
            return went_off if went_off is not None else 90
        if came_on is None:
            return 0  # stayed on the bench
        # Came on as a substitute; may have been taken off again later.
        end = went_off if went_off is not None else 90
        return max(0, end - came_on)

    def parse_team_div(team_div):
        """Return (team_name, players) for one homeTeam/awayTeam container."""
        if not team_div:
            return None, []
        ul = team_div.find('ul', recursive=False) or team_div.find('ul')
        if not ul:
            return None, []

        team_name = None
        players = []
        is_starter = True
        for li in ul.find_all('li', recursive=False):
            cls = li.get('class') or []
            if 'header' in cls and 'clubName' in cls:
                team_name = li.get_text(' ', strip=True)
                continue
            if 'header' in cls and 'separatorTitle' in cls:
                # Everything after the "Pričuvni" separator is the bench.
                if 'Pričuvni' in li.get_text(' ', strip=True):
                    is_starter = False
                continue
            if not ('row' in cls and 'match_lineup' in cls):
                continue

            # Player id: data attribute first, profile link as fallback.
            pid = li.get('data-personid')
            if not pid:
                a = li.find('a', href=re.compile(r'/igraci/(\d+)/'))
                if not a:
                    continue
                pid = re.search(r'/igraci/(\d+)/', a['href']).group(1)
            try:
                pid = int(pid)
            except (TypeError, ValueError):
                continue

            # Shirt number
            broj_dresa = None
            sn = li.find('div', class_='shirtNumber')
            if sn:
                bs = sn.get_text(' ', strip=True).strip()
                if bs.isdigit():
                    broj_dresa = int(bs)

            # Image
            img = li.find('img')
            slika = img.get('src') if img else None

            # Name + position
            ime_prezime = ''
            pozicija = None
            captain = False
            pn = li.find('div', class_='playerName')
            if pn:
                a2 = pn.find('a')
                if a2:
                    ime_prezime = a2.get_text(' ', strip=True)
                # Position (and captain marker) is the text that follows the <h3>.
                rest = pn.get_text(' ', strip=True).replace(ime_prezime, '').strip()
                if '(C)' in rest:
                    captain = True
                rest = rest.replace('(C)', '').strip()
                if 'Vratar' in rest:
                    pozicija = 'Vratar'
                elif 'Igrač' in rest:
                    pozicija = 'Igrač'

            events = parse_events(li)

            players.append({
                'hns_pid': pid,
                'ime_prezime': ime_prezime,
                'broj_dresa': broj_dresa,
                'pozicija': pozicija,
                'slika_url': slika,
                'captain': captain,
                'starter': is_starter,
                'events': events,
                'pogodaka': sum(1 for e in events if e['kind'] == 'gol'),
                'zuti_kartoni': sum(1 for e in events if e['kind'] == 'zuti'),
                # 2nd yellow = red
                'crveni_kartoni': sum(1 for e in events if e['kind'] in ('crveni', 'zuti2')),
                'minute': estimate_minutes(is_starter, events),
            })
        return team_name, players

    home_div = soup.find('div', class_='homeTeam')
    away_div = soup.find('div', class_='awayTeam')

    home_name, home_players = parse_team_div(home_div)
    away_name, away_players = parse_team_div(away_div)

    if home_name:
        out['teams'][home_name] = home_players
    if away_name:
        out['teams'][away_name] = away_players

    # Logo URLs: first <img> inside each team container.
    if home_div:
        h_img = home_div.find('img')
        out['meta']['klub_dom_logo'] = h_img.get('src') if h_img else None
    if away_div:
        a_img = away_div.find('img')
        out['meta']['klub_gost_logo'] = a_img.get('src') if a_img else None

    out['meta']['klub_dom'] = home_name
    out['meta']['klub_gost'] = away_name

    # Date/time, attendance, score, competition
    body_text = soup.get_text(' ', strip=True)
    dm = re.search(r'(\d{1,2}\.\d{1,2}\.\d{4})\.?\s*(\d{1,2}:\d{2})', body_text)
    if dm:
        # The pattern guarantees exactly day.month.year.
        day, month, year = dm.group(1).split('.')
        out['meta']['datum'] = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        out['meta']['vrijeme'] = dm.group(2)
    gm = re.search(r'Gledatelja:\s*(\d+)', body_text)
    if gm:
        out['meta']['gledatelja'] = int(gm.group(1))

    rm = re.search(r'(\d+):(\d+)', out['title'])
    if rm:
        out['meta']['rezultat'] = f"{rm.group(1)}:{rm.group(2)}"

    # Competition name is whatever follows the last comma of the title.
    title_parts = out['title'].split(',')
    if len(title_parts) > 1:
        out['meta']['natjecanje'] = title_parts[-1].strip()

    return out
|
||||
|
||||
|
||||
# Tokens too common in club names to count as evidence of a match.
_GENERIC_CLUB_TOKENS = frozenset({'nk', 'hnk', 'klub', 'nogometni'})


def _name_tokens(name_low):
    """Split a lower-cased club name into its set of non-empty word tokens."""
    return {t for t in re.split(r'\s+', re.sub(r'[^\w]', ' ', name_low)) if t}


def _find_own_roster(klub_naziv_low, teams):
    """Return (team_name, players, fuzzy) for the team that is ours, else (None, [], False)."""
    klub_tokens = _name_tokens(klub_naziv_low)
    klub_last = klub_naziv_low.split()[-1]
    for tn, players in teams.items():
        tn_low = tn.lower()
        # Exact substring in either direction.
        if klub_naziv_low in tn_low or tn_low in klub_naziv_low:
            return tn, players, False
        # Token-set overlap (e.g. "NK Krk" vs "NK Krk Krk" or "NK Vihor" vs "NK Vihor (B)").
        strong = (klub_tokens & _name_tokens(tn_low)) - _GENERIC_CLUB_TOKENS
        if strong and (klub_last in tn_low or tn_low.split()[-1] in klub_naziv_low):
            return tn, players, True
    return None, [], False


def _upsert_clan(cu, klub_id_db, pl):
    """Insert or update the pgz_sport.clanovi row for one parsed player; return its id."""
    name = pl['ime_prezime'] or ''
    parts = name.rsplit(' ', 1)
    ime = parts[0] if len(parts) > 1 else name
    prezime = parts[1] if len(parts) > 1 else ''
    slug = slugify(name)
    src_url = f"{BASE}/igraci/{pl['hns_pid']}/{slug}/"

    cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE source='hns_semafor' AND source_id=%s""",
               (str(pl['hns_pid']),))
    row = cu.fetchone()
    if row:
        cid = row[0]
        # COALESCE keeps the stored value whenever the page did not provide one.
        cu.execute("""UPDATE pgz_sport.clanovi
                      SET ime=%s, prezime=%s, slika_url=COALESCE(NULLIF(%s,''), slika_url),
                          broj_dresa=COALESCE(%s, broj_dresa),
                          pozicija=COALESCE(%s, pozicija),
                          klub_id=%s, source_url=%s, source_synced_at=now(), slug=%s
                      WHERE id=%s""",
                   (ime, prezime, pl.get('slika_url') or '', pl.get('broj_dresa'),
                    pl.get('pozicija'), klub_id_db, src_url, slug, cid))
        return cid

    cu.execute("""INSERT INTO pgz_sport.clanovi
                  (klub_id, ime, prezime, slika_url, broj_dresa, pozicija,
                   source, source_id, source_url, source_synced_at, slug)
                  VALUES (%s,%s,%s,%s,%s,%s,'hns_semafor',%s,%s,now(),%s)
                  RETURNING id""",
               (klub_id_db, ime, prezime, pl.get('slika_url'), pl.get('broj_dresa'),
                pl.get('pozicija'), str(pl['hns_pid']), src_url, slug))
    return cu.fetchone()[0]


def cmd_klub(klub_id_db: int, max_matches: int = 999):
    """Scrape one club: club page → its matches → upsert our players and their match log rows.

    For every match linked from the club page (up to ``max_matches``) the match
    is parsed, the team that corresponds to this club is located by name, and
    for each of its players a ``clanovi`` row is upserted together with a
    ``utakmice_log`` row carrying the per-match stats.

    Args:
        klub_id_db: Primary key of the club in pgz_sport.klubovi.
        max_matches: Maximum number of match pages to process.

    Returns:
        Number of distinct players seen; 0 when the club cannot be scraped
        (unknown id, no hns_klub_id, empty name, or club page fetch failure).
    """
    with conn() as c:
        cu = c.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        cu.execute("SELECT id, naziv, hns_klub_id, hns_slug FROM pgz_sport.klubovi WHERE id=%s",
                   (klub_id_db,))
        klub = cu.fetchone()
    if not klub or not klub['hns_klub_id']:
        log.error(f"Klub #{klub_id_db}: nema hns_klub_id")
        return 0

    klub_naziv_low = (klub['naziv'] or '').lower()
    if not klub_naziv_low.split():
        # An empty name would substring-match every team, so refuse to guess.
        log.error(f"Klub #{klub_id_db}: empty naziv, cannot match teams")
        return 0

    klub_url = f"{BASE}/klubovi/{klub['hns_klub_id']}/{klub['hns_slug'] or 'k'}/"
    log.info(f"Klub: {klub['naziv']} → {klub_url}")
    try:
        html = fetch(klub_url)
    except Exception as e:
        log.error(f"klub fetch failed: {e}")
        return 0

    soup = BeautifulSoup(html, 'html.parser')
    # dict.fromkeys de-duplicates while keeping first-seen page order.
    match_ids = list(dict.fromkeys(
        mm.group(1)
        for mm in (re.search(r'/utakmice/(\d+)/', a['href'])
                   for a in soup.find_all('a', href=re.compile(r'/utakmice/(\d+)/')))
        if mm
    ))
    log.info(f" found {len(match_ids)} matches; processing up to {max_matches}")

    seen_pids = set()
    matches_logged = 0

    for mid in match_ids[:max_matches]:
        match_url = f"{BASE}/utakmice/{mid}/"
        try:
            md = parse_match(fetch(match_url), match_url=match_url)
        except Exception as e:
            log.error(f" match {mid}: {e}")
            continue

        if not md.get('teams'):
            log.warning(f" match {mid}: no teams parsed")
            continue

        # Find which team (home or away) is OURS — substring first, then token overlap.
        matched_team, roster, fuzzy = _find_own_roster(klub_naziv_low, md['teams'])
        if fuzzy:
            log.info(f" fuzzy match: {klub['naziv']} ↔ {matched_team}")
        if not roster:
            continue  # silently skip non-matching

        meta = md.get('meta', {})
        # Take home/away from the parser's explicit fields: the key order of
        # md['teams'] mislabels the away side as home when the home name is missing.
        klub_dom = meta.get('klub_dom')
        klub_gost = meta.get('klub_gost')

        with conn() as c:
            cu = c.cursor()
            for pl in roster:
                if not pl.get('hns_pid'):
                    continue
                seen_pids.add(pl['hns_pid'])
                cid = _upsert_clan(cu, klub_id_db, pl)

                cu.execute("""INSERT INTO pgz_sport.utakmice_log
                              (clan_id, source, source_match_id, source_url, datum, vrijeme,
                               natjecanje, klub_dom, klub_dom_logo, klub_gost, klub_gost_logo, rezultat, za_klub_id,
                               pogodaka, zuti_kartoni, crveni_kartoni, minute, zapocet_kao_starter)
                              VALUES (%s,'hns_semafor',%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                              ON CONFLICT (source, source_match_id, clan_id) DO UPDATE SET
                                datum=EXCLUDED.datum, rezultat=EXCLUDED.rezultat,
                                za_klub_id=EXCLUDED.za_klub_id,
                                pogodaka=EXCLUDED.pogodaka, zuti_kartoni=EXCLUDED.zuti_kartoni,
                                crveni_kartoni=EXCLUDED.crveni_kartoni, minute=EXCLUDED.minute,
                                zapocet_kao_starter=EXCLUDED.zapocet_kao_starter,
                                klub_dom_logo=EXCLUDED.klub_dom_logo, klub_gost_logo=EXCLUDED.klub_gost_logo""",
                           (cid, mid, match_url,
                            meta.get('datum'), meta.get('vrijeme'),
                            meta.get('natjecanje'), klub_dom, meta.get('klub_dom_logo'),
                            klub_gost, meta.get('klub_gost_logo'),
                            meta.get('rezultat'), klub_id_db,
                            pl.get('pogodaka', 0), pl.get('zuti_kartoni', 0),
                            pl.get('crveni_kartoni', 0), pl.get('minute'),
                            pl.get('starter', True)))
            c.commit()
        matches_logged += 1

    log.info(f"Klub {klub['naziv']} done: {len(seen_pids)} unique players, {matches_logged} matches logged")
    return len(seen_pids)
|
||||
|
||||
|
||||
# Command-line entry point: the first argument selects the sub-command.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print(__doc__)
        sys.exit(1)
    cmd = sys.argv[1]
    if cmd == 'seed':
        print(cmd_seed())
    elif cmd == 'player':
        # Guard the positional argument the same way 'klub' does instead of
        # dying with an IndexError traceback.
        if len(sys.argv) < 3:
            print("Usage: player <player_id>")
            sys.exit(2)
        cid = cmd_player(int(sys.argv[2]))
        print(f"clan_id={cid}")
    elif cmd == 'daily':
        cmd_daily()
    elif cmd == 'klub':
        if len(sys.argv) < 3:
            print("Usage: klub <db_klub_id> [max_matches]")
            sys.exit(2)
        # Without an explicit limit a single match is processed.
        max_m = int(sys.argv[3]) if len(sys.argv) > 3 else 1
        cmd_klub(int(sys.argv[2]), max_matches=max_m)
    elif cmd == 'klub_all':
        # Scrape all PGŽ clubs that have hns_klub_id set
        with conn() as c:
            cu = c.cursor()
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE hns_klub_id IS NOT NULL ORDER BY id")
            kids = [r[0] for r in cu.fetchall()]
        log.info(f"Scraping rosters for {len(kids)} klubova…")
        for kid in kids:
            # One failing club must not abort the whole batch.
            try:
                cmd_klub(kid, max_matches=999)
            except Exception as e:
                log.error(f"klub {kid}: {e}")
    else:
        print(f"Unknown: {cmd}")
        sys.exit(2)
|
||||
Executable
+168
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
|
||||
Path: /opt/pgz-sport/scrapers/hos_scraper.py
|
||||
"""
|
||||
import requests, re, json, psycopg2, html as ihtml
|
||||
|
||||
# Connection settings passed to psycopg2.connect(**DB).
# The original call repeated the `port` keyword (6432 and 5432), which is a
# SyntaxError; 6432 is kept because it pairs with the 10.10.0.2 host.
# NOTE(review): credentials are hard-coded in source — consider loading them
# from the environment instead.
DB = dict(host="10.10.0.2", port=6432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
# User-Agent sent with every HTTP request made by this scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}
|
||||
|
||||
def parse_table(html, table_idx):
    """Extract standings rows from the ``table_idx``-th <table> of ``html``.

    A data row looks like ['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']:
    position, (empty), club, played, wins, losses, points. Header rows, rows
    with fewer than six cells and rows whose numeric cells do not parse are
    dropped.

    Args:
        html: Page source to search.
        table_idx: Zero-based index of the table to read.

    Returns:
        List of dicts with keys poz, klub, uk, pob, por, bod and ner (always
        0); empty when ``table_idx`` is past the last table.
    """
    table_bodies = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if table_idx >= len(table_bodies):
        return []

    standings = []
    for tr_html in re.findall(r'<tr[^>]*>(.+?)</tr>', table_bodies[table_idx], re.DOTALL):
        cols = []
        for cell_html in re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', tr_html, re.DOTALL):
            # Drop nested markup, decode entities, trim whitespace.
            cols.append(ihtml.unescape(re.sub(r'<[^>]+>', '', cell_html)).strip())

        if not cols or not cols[0]:
            continue

        # Header rows: marker in the first cell or "Utakmice" in the second.
        second = cols[1] if len(cols) > 1 else ''
        if cols[0] in ('#', 'Pos') or 'Utakmice' in second:
            continue

        rank = re.match(r'(\d+)', cols[0])
        if rank is None or len(cols) < 6:
            continue

        # Club name sits in the third cell, or the second when the third is empty.
        club = cols[2] or cols[1]
        if not club:
            continue

        try:
            standings.append({
                "poz": int(rank.group(1)),
                "klub": club,
                "uk": int(cols[3]),
                "pob": int(cols[4]),
                "por": int(cols[5]),
                "bod": int(cols[6]) if len(cols) > 6 else 0,
                "ner": 0,
            })
        except (ValueError, IndexError):
            continue
    return standings
|
||||
|
||||
def find_table_titles(html):
    """Locate h1–h5 headings so each can be associated with the table that follows it.

    Args:
        html: Page source to scan.

    Returns:
        List of (start_offset, title_text) tuples in document order, limited
        to headings whose tag-stripped, entity-decoded text is longer than
        five characters.
    """
    heading_re = re.compile(r'<(h[1-5])[^>]*>(.*?)</\1>', re.DOTALL)
    titles = []
    for hit in heading_re.finditer(html):
        text = ihtml.unescape(re.sub(r'<[^>]+>', '', hit.group(2))).strip()
        if len(text) <= 5:
            continue  # too short to be a league title
        titles.append((hit.start(), text))
    return titles
|
||||
|
||||
def main():
    """Scrape the league tables on the hos-cvf.hr home page into the database.

    Steps: download the home page, print a diagnostic listing of the tables
    and their nearest preceding headings, then for each entry of the manual
    index → league mapping replace that league's rows in
    pgz_sport.natjecanja_tablice (creating the pgz_sport.natjecanja row when
    missing). Finally print summary reports and write one audit_feed row.

    Each league is replaced inside its own transaction, so a failure midway
    rolls back the DELETE instead of leaving the table half-empty.

    Raises:
        requests.HTTPError: when the home page responds with an error status.
    """
    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()

        # Get main page
        resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
        resp.raise_for_status()  # do not parse an error page as if it were standings
        html = resp.text
        print(f"Length: {len(html)}")

        tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
        print(f"Tables: {len(tables)}")

        # Diagnostic only: pair each table with the closest heading before it.
        title_positions = find_table_titles(html)
        table_positions = [m.start() for m in re.finditer(r'<table[^>]*>', html)]

        table_with_title = []
        for tp in table_positions:
            candidates = [(pos, t) for pos, t in title_positions if pos < tp]
            title = candidates[-1][1] if candidates else "Unknown"
            table_with_title.append((tp, title))

        print("\n=== Table titles ===")
        for i, (tp, t) in enumerate(table_with_title[:8]):
            print(f" Table {i+1}: {t[:80]}")

        # Manual mapping of table index → league (all league tables visible on the page).
        LEAGUES_2025_26 = [
            # Idx, Name, Razina, Spol
            (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
            (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
            (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
            (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
            (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
        ]

        total_inserted = 0
        pgz_seen = set()

        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # `with conn` commits on success and rolls back on any exception.
            with conn:
                # Get/create natjecanje
                cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
                nr = cu.fetchone()
                if nr:
                    natj_id = nr[0]
                else:
                    cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
                                  VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
                                  RETURNING id""", (natj_naziv, razina))
                    natj_id = cu.fetchone()[0]
                    print(f" Created natjecanje id={natj_id}")

                cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
                           (natj_id,))

                for row in rows:
                    # Resolve the club by exact or substring name match, preferring
                    # active, volleyball, PGŽ clubs.
                    klub_id = None
                    cu.execute("""SELECT id, region FROM pgz_sport.klubovi
                                  WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
                                  ORDER BY
                                    CASE WHEN aktivan THEN 0 ELSE 1 END,
                                    CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                                    CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                                    id ASC LIMIT 1""", (row['klub'], f"%{row['klub']}%"))
                    kr = cu.fetchone()
                    if kr:
                        klub_id = kr[0]
                        if kr[1] == 'PGŽ':
                            pgz_seen.add(row['klub'])

                    cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                                  (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                                   nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                                  VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
                               (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
                                json.dumps({"spol": spol})))
                    total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")

        cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
                      FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
                      WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HOS lige ===")
        for rec in cu.fetchall():
            print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

        cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hos_cvf' AND k.region='PGŽ'
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HOS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")

        with conn:
            cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                          VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
                       (json.dumps({"inserted": total_inserted, "pgz_seen": list(pgz_seen)}),))
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
HOS-CVF scraper - Hrvatski odbojkaški savez league standings.
|
||||
Path: /opt/pgz-sport/scrapers/hos_scraper.py
|
||||
"""
|
||||
import requests, re, json, psycopg2, html as ihtml
|
||||
|
||||
# Connection settings passed to psycopg2.connect(**DB).
DB = {
    'host': 'localhost',
    'port': 5432,
    'dbname': 'rinet_v3',
    'user': 'rinet',
    'password': 'R1net2026!SecureDB#v7',
}
# User-Agent sent with every HTTP request made by this scraper.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}
|
||||
|
||||
def parse_table(html, table_idx):
    """Extract standings rows from the ``table_idx``-th <table> of ``html``.

    A data row looks like ['1.', '', 'HAOK MLADOST', '18', '18', '0', '36']:
    position, (empty), club, played, wins, losses, points. Header rows, rows
    with fewer than six cells and rows whose numeric cells do not parse are
    dropped.

    Args:
        html: Page source to search.
        table_idx: Zero-based index of the table to read.

    Returns:
        List of dicts with keys poz, klub, uk, pob, por, bod and ner (always
        0); empty when ``table_idx`` is past the last table.
    """
    table_bodies = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
    if table_idx >= len(table_bodies):
        return []

    standings = []
    for tr_html in re.findall(r'<tr[^>]*>(.+?)</tr>', table_bodies[table_idx], re.DOTALL):
        cols = []
        for cell_html in re.findall(r'<t[hd][^>]*>(.*?)</t[hd]>', tr_html, re.DOTALL):
            # Drop nested markup, decode entities, trim whitespace.
            cols.append(ihtml.unescape(re.sub(r'<[^>]+>', '', cell_html)).strip())

        if not cols or not cols[0]:
            continue

        # Header rows: marker in the first cell or "Utakmice" in the second.
        second = cols[1] if len(cols) > 1 else ''
        if cols[0] in ('#', 'Pos') or 'Utakmice' in second:
            continue

        rank = re.match(r'(\d+)', cols[0])
        if rank is None or len(cols) < 6:
            continue

        # Club name sits in the third cell, or the second when the third is empty.
        club = cols[2] or cols[1]
        if not club:
            continue

        try:
            standings.append({
                "poz": int(rank.group(1)),
                "klub": club,
                "uk": int(cols[3]),
                "pob": int(cols[4]),
                "por": int(cols[5]),
                "bod": int(cols[6]) if len(cols) > 6 else 0,
                "ner": 0,
            })
        except (ValueError, IndexError):
            continue
    return standings
|
||||
|
||||
def find_table_titles(html):
    """Locate h1–h5 headings so each can be associated with the table that follows it.

    Args:
        html: Page source to scan.

    Returns:
        List of (start_offset, title_text) tuples in document order, limited
        to headings whose tag-stripped, entity-decoded text is longer than
        five characters.
    """
    heading_re = re.compile(r'<(h[1-5])[^>]*>(.*?)</\1>', re.DOTALL)
    titles = []
    for hit in heading_re.finditer(html):
        text = ihtml.unescape(re.sub(r'<[^>]+>', '', hit.group(2))).strip()
        if len(text) <= 5:
            continue  # too short to be a league title
        titles.append((hit.start(), text))
    return titles
|
||||
|
||||
def main():
    """Scrape the league tables on the hos-cvf.hr home page into the database.

    Steps: download the home page, print a diagnostic listing of the tables
    and their nearest preceding headings, then for each entry of the manual
    index → league mapping replace that league's rows in
    pgz_sport.natjecanja_tablice (creating the pgz_sport.natjecanja row when
    missing). Finally print summary reports and write one audit_feed row.

    Each league is replaced inside its own transaction, so a failure midway
    rolls back the DELETE instead of leaving the table half-empty.

    Raises:
        requests.HTTPError: when the home page responds with an error status.
    """
    conn = psycopg2.connect(**DB)
    try:
        cu = conn.cursor()

        # Get main page
        resp = requests.get("https://hos-cvf.hr/", headers=HDR, timeout=20)
        resp.raise_for_status()  # do not parse an error page as if it were standings
        html = resp.text
        print(f"Length: {len(html)}")

        tables = re.findall(r'<table[^>]*>(.+?)</table>', html, re.DOTALL)
        print(f"Tables: {len(tables)}")

        # Diagnostic only: pair each table with the closest heading before it.
        title_positions = find_table_titles(html)
        table_positions = [m.start() for m in re.finditer(r'<table[^>]*>', html)]

        table_with_title = []
        for tp in table_positions:
            candidates = [(pos, t) for pos, t in title_positions if pos < tp]
            title = candidates[-1][1] if candidates else "Unknown"
            table_with_title.append((tp, title))

        print("\n=== Table titles ===")
        for i, (tp, t) in enumerate(table_with_title[:8]):
            print(f" Table {i+1}: {t[:80]}")

        # Manual mapping of table index → league (all league tables visible on the page).
        LEAGUES_2025_26 = [
            # Idx, Name, Razina, Spol
            (0, "Supersport Superliga (M) 2025/26", "Superliga", "M"),
            (1, "Supersport Superliga (Ž) 2025/26", "Superliga", "Ž"),
            (2, "Liga doigravanje (M) 2025/26", "Doigravanje", "M"),  # Maybe
            (3, "Supersport Superliga 2 (M) 2025/26", "Superliga 2", "M"),
            (4, "Supersport Superliga 2 (Ž) 2025/26", "Superliga 2", "Ž"),
        ]

        total_inserted = 0
        pgz_seen = set()

        for idx, natj_naziv, razina, spol in LEAGUES_2025_26:
            rows = parse_table(html, idx)
            if not rows:
                continue
            print(f"\n=== {natj_naziv} ({len(rows)} klubova) ===")
            for row in rows[:3]:
                print(f" {row['poz']:>2}. {row['klub']:<30} {row['bod']:>2}b {row['pob']}p {row['por']}por")

            # `with conn` commits on success and rolls back on any exception.
            with conn:
                # Get/create natjecanje
                cu.execute("SELECT id FROM pgz_sport.natjecanja WHERE naziv=%s LIMIT 1", (natj_naziv,))
                nr = cu.fetchone()
                if nr:
                    natj_id = nr[0]
                else:
                    cu.execute("""INSERT INTO pgz_sport.natjecanja (naziv, sport, razina, sezona, source, source_url)
                                  VALUES (%s, 'odbojka', %s, '2025/26', 'hos_cvf', 'https://hos-cvf.hr/')
                                  RETURNING id""", (natj_naziv, razina))
                    natj_id = cu.fetchone()[0]
                    print(f" Created natjecanje id={natj_id}")

                cu.execute("DELETE FROM pgz_sport.natjecanja_tablice WHERE natjecanje_id=%s AND source='hos_cvf'",
                           (natj_id,))

                for row in rows:
                    # Resolve the club by exact or substring name match, preferring
                    # active, volleyball, PGŽ clubs.
                    klub_id = None
                    cu.execute("""SELECT id, region FROM pgz_sport.klubovi
                                  WHERE LOWER(naziv) = LOWER(%s) OR LOWER(naziv) LIKE LOWER(%s)
                                  ORDER BY
                                    CASE WHEN aktivan THEN 0 ELSE 1 END,
                                    CASE WHEN sport='odbojka' THEN 0 ELSE 1 END,
                                    CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                                    id ASC LIMIT 1""", (row['klub'], f"%{row['klub']}%"))
                    kr = cu.fetchone()
                    if kr:
                        klub_id = kr[0]
                        if kr[1] == 'PGŽ':
                            pgz_seen.add(row['klub'])

                    cu.execute("""INSERT INTO pgz_sport.natjecanja_tablice
                                  (natjecanje_id, klub_id, klub_naziv, pozicija, odigrano, pobjede,
                                   nerijeseno, porazi, gol_z, gol_p, gol_razlika, bodovi, source, source_url, updated_at, extra_data)
                                  VALUES (%s, %s, %s, %s, %s, %s, 0, %s, 0, 0, 0, %s, 'hos_cvf', 'https://hos-cvf.hr/', now(), %s::jsonb)""",
                               (natj_id, klub_id, row['klub'], row['poz'], row['uk'], row['pob'], row['por'], row['bod'],
                                json.dumps({"spol": spol})))
                    total_inserted += 1

        print(f"\n=== TOTAL: {total_inserted}, PGŽ klubovi: {pgz_seen} ===")

        cu.execute("""SELECT n.naziv, count(t.*), count(*) FILTER (WHERE t.klub_id IS NOT NULL)
                      FROM pgz_sport.natjecanja n
                      LEFT JOIN pgz_sport.natjecanja_tablice t ON n.id=t.natjecanje_id AND t.source='hos_cvf'
                      WHERE n.source='hos_cvf' GROUP BY n.id, n.naziv ORDER BY n.id""")
        print("\n=== HOS lige ===")
        for rec in cu.fetchall():
            print(f" {rec[1]:>3} klubova ({rec[2]} matched) {rec[0]}")

        cu.execute("""SELECT n.naziv, t.pozicija, t.klub_naziv, t.bodovi, k.id, k.naziv
                      FROM pgz_sport.natjecanja_tablice t
                      JOIN pgz_sport.natjecanja n ON n.id=t.natjecanje_id
                      LEFT JOIN pgz_sport.klubovi k ON k.id=t.klub_id
                      WHERE t.source='hos_cvf' AND k.region='PGŽ'
                      ORDER BY n.naziv, t.pozicija""")
        print("\n=== PGŽ klubovi u HOS ===")
        for rec in cu.fetchall():
            print(f" {rec[0][:30]:<30} {rec[1]:>2}. {rec[2]:<25} {rec[3]:>3}b -> {rec[4]} '{rec[5]}'")

        with conn:
            cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details)
                          VALUES ('natjecanja_tablice', 'hos_scrape', 'hos_cvf', NULL, %s::jsonb)""",
                       (json.dumps({"inserted": total_inserted, "pgz_seen": list(pgz_seen)}),))
    finally:
        conn.close()


if __name__ == "__main__":
    main()
|
||||
Executable
+178
@@ -0,0 +1,178 @@
|
||||
#!/usr/bin/env python3
|
||||
"""HVS Riznica - parse direct HTML, no Playwright. Extract champions per season."""
|
||||
import re, requests, psycopg2
|
||||
|
||||
# Connection settings for the target database.
# NOTE(review): credentials are hard-coded in source — consider loading them
# from the environment instead.
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
          user='rinet', password='R1net2026!SecureDB#v7')
conn = psycopg2.connect(**DB); conn.autocommit = True
cu = conn.cursor()

# (label stored as `natjecanje`, URL slug under https://hvs.hr/riznica/)
CATEGORIES = [
    ("Prvenstvo Hrvatske - muškarci", "prvenstvo-hrvatske-muskarci"),
    ("Prvenstvo Hrvatske - žene", "prvenstvo-hrvatske-zene"),
    ("Kup Hrvatske - muškarci", "kup-hrvatske-muskarci"),
    ("Kup Hrvatske - žene", "kup-hrvatske-zene"),
    ("Trofej Toni Nardelli", "trofej-toni-nardelli"),
    ("Vaterpolist godine", "vaterpolist"),
    ("Vaterpolistica godine", "vaterpolistica"),
]

UA = "RiNET-Civic/1.0 (https://rinet.one)"
HDR = {"User-Agent": UA}

inserted_total = 0
all_winners = []

try:
    for label, slug in CATEGORIES:
        url = f"https://hvs.hr/riznica/{slug}/"
        print(f"\n=== {label} ({slug}) ===", flush=True)
        try:
            r = requests.get(url, headers=HDR, timeout=15)
            if r.status_code != 200:
                print(f" HTTP {r.status_code}"); continue
            html = r.text

            # --- Diagnostics: page-wide counts of slides, names and years ---
            slides_re = re.compile(
                r'<div class="riznica__slide[^"]*"[^>]*>(.+?)(?=<div class="riznica__slide|<section|<footer)',
                re.DOTALL
            )
            slides = slides_re.findall(html)
            print(f" Slides: {len(slides)}", flush=True)

            # Champion names (competitions) and medal names (individual awards).
            comp_names = re.findall(r'<h2 class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<', html)
            medal_names = re.findall(r'<h2 class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<', html)
            # Years: the currently visible one, season ranges, and single years
            # (Trofej / Vaterpolist pages often carry just one year).
            years = re.findall(r'<h2 class="riznica__competitions__current"[^>]*>\s*([^<]+?)\s*<', html)
            years_alt = re.findall(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<', html)
            years_single = re.findall(r'<h2 class="riznica__competitions__name"[^>]*>\s*([0-9]{4}\.)\s*<', html)

            all_years = list(set(years + years_alt + years_single))
            all_years_sorted = sorted([y.strip() for y in all_years if y.strip()])
            print(f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(all_years_sorted)}", flush=True)
            if comp_names: print(f" Sample champ: {comp_names[:3]}")
            if medal_names: print(f" Sample medal: {medal_names[:3]}")
            if all_years_sorted: print(f" Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}")

            # --- Extraction: pair name and year inside each slide block ---
            # Page-wide lists are not reliably parallel, so the page is split at
            # every slide <div> and one name + one year are read per block.
            all_records = []
            slide_blocks = re.split(r'<div class="riznica__slide(?:\s+is-visible)?">', html)
            for blk in slide_blocks[1:]:
                # Name: champion first, medal name as fallback.
                name_m = re.search(r'class="riznica__competition__name"[^>]*>\s*([^<]+?)\s*<', blk)
                if not name_m:
                    name_m = re.search(r'class="riznica__medal__name"[^>]*>\s*([^<]+?)\s*<', blk)
                year_m = re.search(r'class="riznica__competitions__current"[^>]*>\s*([0-9./]+)', blk)
                if not year_m:
                    year_m = re.search(r'class="riznica__competitions__name(?:[^"]*)"[^>]*>\s*([0-9./]+?)\s*<', blk)
                # Image
                img_m = re.search(r'<img src="([^"]+)"', blk)

                if name_m and year_m:
                    all_records.append({
                        "name": name_m.group(1).strip(),
                        "year": year_m.group(1).strip(),
                        "img": img_m.group(1) if img_m else None
                    })

            print(f" Parsed records: {len(all_records)}", flush=True)
            if all_records:
                for rec in all_records[:3]: print(f" {rec['year']}: {rec['name']}")

            # Insert into DB
            for rec in all_records:
                year = rec['year']
                champ = rec['name']

                # Find klub_id (substring or exact name; active water-polo clubs first).
                cu.execute("""SELECT id FROM pgz_sport.klubovi
                              WHERE LOWER(naziv) LIKE LOWER(%s) OR LOWER(naziv) = LOWER(%s)
                              ORDER BY
                                CASE WHEN aktivan THEN 0 ELSE 1 END,
                                CASE WHEN sport='vaterpolo' THEN 0 ELSE 1 END,
                                id ASC LIMIT 1""",
                           (f'%{champ}%', champ))
                kid_row = cu.fetchone()
                klub_id = kid_row[0] if kid_row else None

                try:
                    cu.execute("""INSERT INTO pgz_sport.klub_sezona
                                  (klub_id, klub_naziv, sezona, natjecanje, plasiranje, trofej, source, source_url)
                                  VALUES (%s, %s, %s, %s, 1, %s, 'hvs_riznica', %s)
                                  ON CONFLICT DO NOTHING""",
                               (klub_id, champ, year, label,
                                f'1. mjesto - {label} {year}', url))
                    if cu.rowcount > 0:
                        inserted_total += 1
                except Exception as e:
                    # Best-effort per record, but report the failure instead of hiding it.
                    print(f" insert failed for {champ} ({year}): {e}")

            all_winners.append({"category": label, "count": len(all_records), "records": all_records})

        except Exception as e:
            print(f" EXC: {e}")

    print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===")

    # Audit log
    cu.execute("""INSERT INTO pgz_sport.audit_feed
                  (table_name, action, source, source_url, details)
                  VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""",
               (f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}',))

    # Top winners
    cu.execute("""SELECT klub_naziv, count(*) AS naslova
                  FROM pgz_sport.klub_sezona
                  WHERE source='hvs_riznica'
                  GROUP BY klub_naziv
                  ORDER BY count(*) DESC LIMIT 15""")
    print("\n=== TOP HVS prvaci/medalisti ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3}× {r[0]}")

    # PGŽ-relevant
    # NOTE(review): this lists every club that matched a klubovi row; there is
    # no region filter, so it may include non-PGŽ clubs — confirm intent.
    cu.execute("""SELECT k.naziv, count(ks.*) AS naslova
                  FROM pgz_sport.klub_sezona ks
                  JOIN pgz_sport.klubovi k ON k.id = ks.klub_id
                  WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL
                  GROUP BY k.naziv
                  ORDER BY count(*) DESC""")
    print("\n=== PGŽ klubovi sa HVS naslovima ===")
    for r in cu.fetchall():
        print(f" {r[1]:>3}× {r[0]}")
finally:
    # Release the connection even when a report query fails.
    conn.close()
|
||||
Executable
+117
@@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python3
|
||||
import psycopg2, json, re
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
# Load the 2025 per-sport totals. The file is opened in a context manager so
# the handle is closed deterministically, and decoded explicitly as UTF-8
# because the keys are Croatian sport names with diacritics and must not
# depend on the process locale.
with open('/opt/pgz-sport/data/sport_totals_2025.json', encoding='utf-8') as fh:
    totals = json.load(fh)
print(f"Sport totals 2025: {len(totals)}")
|
||||
|
||||
SPORT_TO_SAVEZ_KEYS = {
|
||||
'atletika':'atletski','biciklizam':'biciklist','boćanje':'boćar','boks':'boksač',
|
||||
'gimnastika':'gimnast','jedriličarstvo':'jedrili','judo':'judo','karate':'karate',
|
||||
'kendo':'kendo','kickboxing':'kickbox','košarka':'košark','kuglanje':'kuglač',
|
||||
'nogomet':'nogomet','odbojka':'odbojkaš','parasport':'parasport','ples':'ples',
|
||||
'plivanje':'plivačk','ronilaštvo':'ronila','rukomet':'rukometn',
|
||||
'sinkronizirano plivanje':'plivačk','skijanje':'skijaš','sport gluhih':'gluh',
|
||||
'sportski ribolov':'ribolov','sportsko penjanje':'penjač','stolni tenis':'stolnotenis',
|
||||
'streličarstvo':'streličar','streljaštvo':'streljač','tenis':'teniski',
|
||||
'triatlon':'triatlon','vaterpolo':'vaterpol','veslanje':'veslač',
|
||||
}
|
||||
|
||||
cu.execute("SELECT id, naziv FROM pgz_sport.savezi WHERE naziv ILIKE '%PGŽ%' OR naziv ILIKE '%Primorsko%'")
|
||||
all_savezi = cu.fetchall()
|
||||
print(f"PGŽ savezi: {len(all_savezi)}")
|
||||
|
||||
# Map each sport to the id of the first fetched federation whose name contains
# the sport's keyword (case-insensitive). Sports with no matching federation
# are simply left out of the map.
# NOTE(review): 'plivanje' and 'sinkronizirano plivanje' share the keyword
# 'plivačk', so both resolve to the same federation id — confirm intended.
_NO_MATCH = object()
savez_map = {}
for sport_lc, key in SPORT_TO_SAVEZ_KEYS.items():
    needle = key.lower()
    # Lazy generator: stops scanning at the first hit, like the original break.
    first_hit = next(
        (savez_id for savez_id, savez_name in all_savezi
         if needle in savez_name.lower()),
        _NO_MATCH,
    )
    if first_hit is not _NO_MATCH:
        savez_map[sport_lc] = first_hit
|
||||
|
||||
# Update statistika_saveza for 2025
|
||||
n_upd = 0
|
||||
for sport_lc, data in totals.items():
|
||||
sid = savez_map.get(sport_lc)
|
||||
if not sid: continue
|
||||
cu.execute("SELECT id FROM pgz_sport.statistika_saveza WHERE savez_id=%s AND godina=%s", (sid, 2025))
|
||||
e = cu.fetchone()
|
||||
if e:
|
||||
cu.execute("UPDATE pgz_sport.statistika_saveza SET registriranih=%s WHERE id=%s", (data['total'], e[0]))
|
||||
else:
|
||||
cu.execute("INSERT INTO pgz_sport.statistika_saveza (savez_id, godina, registriranih) VALUES (%s,%s,%s)",
|
||||
(sid, 2025, data['total']))
|
||||
n_upd += 1
|
||||
print(f"Statistika_saveza 2025 updated: {n_upd}")
|
||||
|
||||
# Kategorizirani
|
||||
# Load the 2025 list of categorised athletes. A context manager closes the
# handle deterministically; UTF-8 is forced because the records contain
# Croatian names and must not depend on the process locale.
with open('/opt/pgz-sport/data/kategorizirani_2025.json', encoding='utf-8') as fh:
    KAT = json.load(fh)
print(f"\nKategorizirani: {len(KAT)}")
|
||||
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija TEXT")
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija_od DATE")
|
||||
cu.execute("ALTER TABLE pgz_sport.clanovi ADD COLUMN IF NOT EXISTS hoo_kategorija_do DATE")
|
||||
|
||||
n_matched = 0; n_inserted = 0; n_dup = 0
|
||||
seen = set()
|
||||
|
||||
def parse_d(s):
    """Convert a ``D.M.YYYY`` date string to ISO ``YYYY-MM-DD``.

    Surrounding whitespace and one trailing dot (the usual Croatian form,
    e.g. ``"1.2.2025."``) are tolerated. The year is used as written; no
    two-digit-year expansion is attempted.

    Args:
        s: Date text with day, month and year separated by dots.

    Returns:
        The ISO-formatted date string, or ``None`` when ``s`` is not a
        string, does not consist of exactly three numeric parts, or does
        not denote a real calendar date (e.g. ``"31.02.2025"``).
    """
    from datetime import date  # local import: keeps the block self-contained

    try:
        text = s.strip()
        if text.endswith('.'):
            text = text[:-1]
        # Unpacking raises ValueError on a wrong part count, int() on
        # non-numeric parts, and date() on impossible calendar dates.
        day, month, year = (int(part) for part in text.split('.'))
        return date(year, month, day).isoformat()
    except (AttributeError, TypeError, ValueError):
        return None
|
||||
|
||||
def _like_escape(text):
    """Escape LIKE metacharacters (backslash, %, _) so text matches literally."""
    return text.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')


# Attach the HOO category to every listed athlete: update the existing
# clanovi row when name (and sport, if set) matches, otherwise insert a new
# row linked to the club when one can be found by name.
for k in KAT:
    ime = k['ime'].strip()
    prezime = k['prezime'].strip()
    sport = k['sport'].strip().lower()
    klub = k['klub'].strip().strip('"').strip()
    mjesto = k['mjesto'].strip()
    hoo_kat = k['hoo_kategorija']
    od = parse_d(k['vrijedi_od'])
    do = parse_d(k['vrijedi_do'])

    # The same athlete/sport pair is processed only once per run.
    key = (ime.lower(), prezime.lower(), sport)
    if key in seen:
        n_dup += 1
        continue
    seen.add(key)

    # Prefer a row whose sport matches over one with no sport recorded, and
    # break remaining ties by id so repeated runs pick the same row.
    cu.execute("""SELECT id FROM pgz_sport.clanovi WHERE LOWER(ime)=LOWER(%s) AND LOWER(prezime)=LOWER(%s)
                  AND (sport IS NULL OR LOWER(sport)=LOWER(%s))
                  ORDER BY (sport IS NULL), id LIMIT 1""", (ime, prezime, sport))
    row = cu.fetchone()
    if row:
        cu.execute("""UPDATE pgz_sport.clanovi SET hoo_kategorija=%s, hoo_kategorija_od=%s,
                      hoo_kategorija_do=%s, sport=COALESCE(sport, %s) WHERE id=%s""",
                   (hoo_kat, od, do, sport, row[0]))
        n_matched += 1
    else:
        klub_id = None
        # An empty club name would turn the pattern into '%%' and match an
        # arbitrary club, so the lookup is skipped in that case. Wildcard
        # characters inside the name are escaped to be matched literally.
        if klub:
            cu.execute("SELECT id FROM pgz_sport.klubovi WHERE LOWER(naziv) LIKE LOWER(%s) AND region IS NOT NULL LIMIT 1",
                       (f"%{_like_escape(klub[:30])}%",))
            kr = cu.fetchone()
            if kr:
                klub_id = kr[0]
        cu.execute("""INSERT INTO pgz_sport.clanovi (ime, prezime, sport, mjesto_rodenja, klub_id,
                      hoo_kategorija, hoo_kategorija_od, hoo_kategorija_do)
                      VALUES (%s,%s,%s,%s,%s,%s,%s,%s)""",
                   (ime, prezime, sport, mjesto, klub_id, hoo_kat, od, do))
        n_inserted += 1

print(f"Matched: {n_matched}, Inserted: {n_inserted}, Dup: {n_dup}")
|
||||
|
||||
cu.execute("SELECT count(*) FROM pgz_sport.clanovi WHERE hoo_kategorija IS NOT NULL")
|
||||
print(f"\nSportaša s HOO kategorijom: {cu.fetchone()[0]}")
|
||||
cu.execute("SELECT hoo_kategorija, count(*) FROM pgz_sport.clanovi WHERE hoo_kategorija IS NOT NULL GROUP BY hoo_kategorija ORDER BY hoo_kategorija")
|
||||
for r in cu.fetchall():
|
||||
print(f" HOO {r[0]}: {r[1]}")
|
||||
cu.execute("SELECT count(*) FROM pgz_sport.clanovi")
|
||||
print(f"\nUkupno sportaša: {cu.fetchone()[0]}")
|
||||
|
||||
# Sport totals 2025 result
|
||||
cu.execute("""SELECT s.naziv, ss.registriranih FROM pgz_sport.statistika_saveza ss
|
||||
JOIN pgz_sport.savezi s ON s.id=ss.savez_id
|
||||
WHERE ss.godina=2025 AND ss.registriranih > 0 ORDER BY ss.registriranih DESC LIMIT 12""")
|
||||
print("\n2025 statistika_saveza top 12:")
|
||||
for r in cu.fetchall():
|
||||
print(f" {r[0]:50} {r[1]:>6}")
|
||||
|
||||
conn.close()
|
||||
Executable
+114
@@ -0,0 +1,114 @@
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
DOCS = [
|
||||
('Zakon o porezu na dobit','Porezni tretman sportskih udruga i klubova.','zakon','RH','Sabor RH',None,'NN 177/04+','https://www.zakon.hr/z/99/Zakon-o-porezu-na-dobit',['porez','udruga'],'2004-12-23'),
|
||||
('Zakon o PDV-u — sport','PDV tretman ulaznica i sponzorstava.','zakon','RH','Sabor RH',None,'NN 73/13','https://www.zakon.hr/z/186/Zakon-o-porezu-na-dodanu-vrijednost',['PDV','ulaznice'],'2013-06-19'),
|
||||
('Zakon o radu — sportaš/trener','Radni odnosi profesionalnih sportaša i trenera.','zakon','RH','Sabor RH',None,'NN 93/14, 151/22, 64/23','https://www.zakon.hr/z/307/Zakon-o-radu',['radni odnos','trener'],'2014-07-25'),
|
||||
('Zakon o pravu na pristup informacijama','Obveze JLS za objavu o financiranju sporta.','zakon','RH','Sabor RH',None,'NN 25/13, 85/15, 69/22','https://www.zakon.hr/z/126/Zakon-o-pravu-na-pristup-informacijama',['transparentnost'],'2013-02-15'),
|
||||
('Zakon o volonterstvu','Volonterski rad u sportskim klubovima.','zakon','RH','Sabor RH',None,'NN 58/07, 22/13, 84/21','https://www.zakon.hr/z/220/Zakon-o-volonterstvu',['volonter'],'2007-06-08'),
|
||||
('Pravilnik o registru sportskih udruga MTS','MTS javni registar.','pravilnik','RH','MTS',None,'NN 31/24','https://mtus.gov.hr/sport-2625/registri-sportskih-udruga/22516',['registar','MTS'],'2024-03-15'),
|
||||
('Pravilnik o stručnim poslovima u sportu','Kvalifikacije i licence stručnog kadra.','pravilnik','RH','MTS',None,'NN 89/23','https://narodne-novine.nn.hr/clanci/sluzbeni/2023_08_89_1334.html',['licenca','trener'],'2023-08-04'),
|
||||
('Pravilnik o statusu sportaša s posebnim statusom','Vrhunski sportaši — stipendije, doprinosi.','pravilnik','RH','MTS',None,'NN 14/23','https://mtus.gov.hr/sport/2625',['vrhunski','staž'],'2023-02-01'),
|
||||
('Pravilnik o korištenju olimpijske oznake','HOO pravila o olimpijskim simbolima.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['olimpijski'],'2020-01-01'),
|
||||
('Pravilnik o članicama HOO-a','Kriteriji za nacionalne saveze.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/clanstvo',['HOO','clanstvo'],'2022-01-01'),
|
||||
('Etički kodeks HOO-a','Etička načela u sportu.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['etika'],'2021-01-01'),
|
||||
('Pravilnik HOO o sportskim stipendijama','HOO program stipendiranja.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/sport-u-hrvatskoj/stipendije',['stipendija'],'2023-01-01'),
|
||||
('Pravilnik o nagradama HOO','HOO godišnje nagrade.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/dokumenti',['nagrade'],'2022-01-01'),
|
||||
('Statut HNS-a','Hrvatski nogometni savez statut.','statut','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS'],'2023-01-01'),
|
||||
('Statut HRS-a','Hrvatski rukometni savez statut.','statut','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS'],'2023-01-01'),
|
||||
('Statut HKS-a','Hrvatski košarkaški savez statut.','statut','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS'],'2023-01-01'),
|
||||
('Statut HBS Boćarski','Hrvatski boćarski savez.','statut','Savez','HBS','boćanje',None,'https://hrvatski-bocarski-savez.hr/savez/dokumenti/',['HBS','boćanje'],'2022-01-01'),
|
||||
('Statut HOS Odbojkaški','Hrvatski odbojkaški savez.','statut','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS'],'2022-01-01'),
|
||||
('Statut HŠS Šahovski','Hrvatski šahovski savez.','statut','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','FIDE'],'2022-01-01'),
|
||||
('Statut HVS Veslački','Hrvatski veslački savez.','statut','Savez','HVS','veslanje',None,'https://www.veslanje.hr/dokumenti',['HVS'],'2021-01-01'),
|
||||
('Statut HJS Jedriličarski','Hrvatski jedriličarski savez.','statut','Savez','HJS','jedriličarstvo',None,'https://www.hjs.hr/dokumenti',['HJS'],'2022-01-01'),
|
||||
('Statut HAS Atletski','Hrvatski atletski savez.','statut','Savez','HAS','atletika',None,'https://www.has.hr/index.php/dokumenti',['HAS','World Athletics'],'2023-01-01'),
|
||||
('Statut HVPS Vaterpolski','Hrvatski vaterpolski savez.','statut','Savez','HVPS','vaterpolo',None,'https://hvs.hr/dokumenti',['HVPS'],'2022-01-01'),
|
||||
('Statut HPS Plivacki','Hrvatski plivacki savez.','statut','Savez','HPS','plivanje',None,'https://hps.com.hr/dokumenti',['HPS','FINA'],'2022-01-01'),
|
||||
('Statut HTS Teniski','Hrvatski teniski savez.','statut','Savez','HTS','tenis',None,'https://hts.hr/dokumenti/',['HTS','ITF'],'2023-01-01'),
|
||||
('Statut HSTS Stolnoteniski','Hrvatski stolnoteniski savez.','statut','Savez','HSTS','stolni tenis',None,'https://hsts.hr/dokumenti/',['HSTS','ITTF'],'2022-01-01'),
|
||||
('Statut HBS-UCI Biciklistički','Hrvatski biciklistički savez.','statut','Savez','HBS-UCI','biciklizam',None,'https://www.hbs.hr/dokumenti',['biciklizam','UCI'],'2022-01-01'),
|
||||
('Statut HKZ Karate','Hrvatski karate savez.','statut','Savez','HKZ','karate',None,'https://www.hkz.hr/dokumenti',['karate','WKF'],'2022-01-01'),
|
||||
('Statut HJSav Judo','Hrvatski judo savez.','statut','Savez','HJSav','judo',None,'https://www.judosavez.hr/dokumenti',['judo','IJF'],'2022-01-01'),
|
||||
('Statut HTKDS Taekwondo','Hrvatski taekwondo savez.','statut','Savez','HTKDS','taekwondo',None,'https://www.taekwondo.hr/dokumenti/',['taekwondo','WT'],'2022-01-01'),
|
||||
('Statut PGZ','Temeljni akt PGŽ-a.','statut','PGZ','PGŽ',None,None,'https://www.pgz.hr/sluzbene-novine',['PGŽ','statut'],'2021-04-01'),
|
||||
('Plan razvoja PGŽ 2021-2027','Plan razvoja županije.','strategija','PGZ','PGŽ',None,None,'https://www.pgz.hr/strategije',['plan','EU fondovi'],'2021-12-01'),
|
||||
('Pravilnik o sufinanciranju opreme PGŽ','Sredstva za opremu klubova.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['oprema'],'2023-01-01'),
|
||||
('Pravilnik o sufinanciranju građevina PGŽ','Investicijska sredstva za građevine.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['građevina'],'2023-01-01'),
|
||||
('Odluka o nagradama u sportu PGŽ','Sustav nagrada.','odluka','PGZ','PGŽ',None,None,'https://www.pgz.hr/odluke',['nagrada'],'2023-01-01'),
|
||||
('Strategija sporta Grada Rijeke 2020-2030','Gradski plan razvoja sporta.','strategija','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/strategija-sporta',['Rijeka'],'2020-12-01'),
|
||||
('Pravilnik javne potrebe sport Rijeka','Postupak dodjele sredstava.','pravilnik','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/pravilnici',['Rijeka','financiranje'],'2023-01-01'),
|
||||
('Strategija sporta Grada Opatije','Gradska strategija.','strategija','Grad Rijeka','Grad Opatija',None,None,'https://www.opatija.hr/sport',['Opatija'],'2021-01-01'),
|
||||
('Program sport Crikvenica','Crikvenički program.','program','Grad Rijeka','Grad Crikvenica',None,None,'https://www.crikvenica.hr/programi',['Crikvenica'],'2024-01-01'),
|
||||
('Program sport Krk','Krčki program.','program','Grad Rijeka','Grad Krk',None,None,'https://www.grad-krk.hr/programi',['Krk'],'2024-01-01'),
|
||||
('Program sport Mali Lošinj','Lošinjski program.','program','Grad Rijeka','Grad Mali Lošinj',None,None,'https://www.mali-losinj.hr/programi',['Lošinj'],'2024-01-01'),
|
||||
('Program sport Delnice','Delnički program.','program','Grad Rijeka','Grad Delnice',None,None,'https://www.delnice.hr/programi',['Delnice'],'2024-01-01'),
|
||||
('Program sport Vrbovsko','Vrbovski program.','program','Grad Rijeka','Grad Vrbovsko',None,None,'https://www.vrbovsko.hr/programi',['Vrbovsko'],'2024-01-01'),
|
||||
('Program sport Novi Vinodolski','Novovinodolski program.','program','Grad Rijeka','Grad Novi Vinodolski',None,None,'https://www.novi-vinodolski.hr/programi',['Novi Vinodolski'],'2024-01-01'),
|
||||
('Program sport Kraljevica','Kraljevički program.','program','Grad Rijeka','Grad Kraljevica',None,None,'https://www.kraljevica.hr/programi',['Kraljevica'],'2024-01-01'),
|
||||
('Program sport Bakar','Bakarski program.','program','Grad Rijeka','Grad Bakar',None,None,'https://www.bakar.hr/programi',['Bakar'],'2024-01-01'),
|
||||
('Program sport Cres','Creski program.','program','Grad Rijeka','Grad Cres',None,None,'https://www.cres.hr/programi',['Cres'],'2024-01-01'),
|
||||
('Program sport Rab','Rabski program.','program','Grad Rijeka','Grad Rab',None,None,'https://www.rab.hr/programi',['Rab'],'2024-01-01'),
|
||||
('Program sport Kastav','Kastavski program.','program','Grad Rijeka','Grad Kastav',None,None,'https://www.kastav.hr/programi',['Kastav'],'2024-01-01'),
|
||||
('WADA Lista zabranjenih tvari','WADA Prohibited List godišnje.','pravilnik','EU','WADA',None,None,'https://www.wada-ama.org/en/prohibited-list',['WADA'],'2024-01-01'),
|
||||
('Pravilnik HASMS o TUE','TUE postupak u Hrvatskoj.','pravilnik','RH','HASMS',None,None,'https://hasms.hr/anti-doping/dokumenti/',['TUE','HASMS'],'2023-01-01'),
|
||||
('European Sports Charter','CoE povelja o sportu.','pravilnik','EU','Council of Europe',None,None,'https://rm.coe.int/european-sports-charter-1992',['CoE'],'2021-10-13'),
|
||||
('Macolin Convention','CoE protiv namještanja.','pravilnik','EU','Council of Europe',None,None,'https://www.coe.int/en/web/sport/macolin-convention',['namještanje'],'2014-09-18'),
|
||||
('UEFA Financial Sustainability','Financijski Fair Play.','pravilnik','EU','UEFA','nogomet',None,'https://documents.uefa.com/v/u/MFFvQjlRCJF7RJYIoyzMRA',['UEFA','FFP'],'2022-06-01'),
|
||||
('Pravilnik o radu školskih sportskih društava','ŠŠD organizacija.','pravilnik','RH','MZO + MTS',None,None,'https://mzo.gov.hr/sport',['ŠŠD'],'2022-01-01'),
|
||||
('Pravilnik za status parasportaša','Klasifikacija parasportaša.','pravilnik','RH','HPO',None,None,'https://www.hpo.hr/Dokumenti',['parasport','HPO'],'2022-01-01'),
|
||||
('IPC klasifikacija parasportaša','IPC klasifikacije po sportu.','pravilnik','EU','IPC',None,None,'https://www.paralympic.org/classification',['IPC','parasport'],'2023-01-01'),
|
||||
('Zakon o lovstvu','Temeljni zakon o lovstvu.','zakon','RH','Sabor RH','lov','NN 99/18, 32/19, 153/22','https://www.zakon.hr/z/127/Zakon-o-lovstvu',['lov','divljač'],'2018-11-09'),
|
||||
('Pravilnik o lovostaju','Lovne sezone.','pravilnik','RH','Min. poljoprivrede','lov',None,'https://mps.gov.hr/lovstvo/dokumenti',['lovostaj'],'2022-01-01'),
|
||||
('Pravilnik o lovniku','Pravila lova, oprema.','pravilnik','RH','Min. poljoprivrede','lov',None,'https://mps.gov.hr/lovstvo/dokumenti',['lovnik'],'2022-01-01'),
|
||||
('Pravila igre Nogomet IFAB','IFAB Laws of the Game.','pravilnik','EU','IFAB/FIFA','nogomet',None,'https://www.theifab.com/laws/',['IFAB','FIFA'],'2024-07-01'),
|
||||
('Pravila igre Rukomet IHF','IHF Rules.','pravilnik','EU','IHF','rukomet',None,'https://www.ihf.info/regulations-documents',['IHF'],'2022-07-01'),
|
||||
('Pravila igre Košarka FIBA','FIBA Official Rules.','pravilnik','EU','FIBA','košarka',None,'https://www.fiba.basketball/basketballrules',['FIBA'],'2024-10-01'),
|
||||
('Propisi natjecanja HBS','HBS volo/raffa/petanque.','pravilnik_savez','Savez','HBS','boćanje',None,'https://hrvatski-bocarski-savez.hr/savez/dokumenti/',['HBS','natjecanje'],'2024-01-01'),
|
||||
('Propisi natjecanja HVS','HVS regate, kategorije.','pravilnik_savez','Savez','HVS','veslanje',None,'https://www.veslanje.hr/dokumenti',['HVS'],'2024-01-01'),
|
||||
('Propisi natjecanja HJS','HJS regate, RRS.','pravilnik_savez','Savez','HJS','jedriličarstvo',None,'https://www.hjs.hr/dokumenti',['HJS'],'2024-01-01'),
|
||||
('Propisi natjecanja HŠS','HŠS FIDE pravila.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','FIDE'],'2024-01-01'),
|
||||
]
|
||||
|
||||
def main():
    """Insert the DOCS entries missing from pgz_sport.dokumenti and print a summary.

    A document is treated as already present when its lower-cased title
    equals an existing lower-cased title. Each insert failure is counted and
    reported without aborting the run. The connection is always closed, even
    when a query raises.
    """
    conn = psycopg2.connect(**DB)
    try:
        conn.autocommit = True
        cu = conn.cursor()
        cu.execute("SELECT LOWER(COALESCE(title, '')) FROM pgz_sport.dokumenti")
        existing = {r[0] for r in cu.fetchall()}

        n_added = 0
        n_skipped = 0
        n_err = 0
        for d in DOCS:
            title_lc = d[0].lower()
            if title_lc in existing:
                n_skipped += 1
                continue
            try:
                cu.execute('''INSERT INTO pgz_sport.dokumenti
                              (title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik,
                               izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                              VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)''', d)
                n_added += 1
                # Also guards against duplicate titles inside DOCS itself.
                existing.add(title_lc)
            except Exception as e:
                # Best-effort load: report the failing row and carry on.
                n_err += 1
                print(f' err {d[0][:40]}: {e}')

        print(f'Added: {n_added}, Skipped: {n_skipped}, Errors: {n_err}')
        cu.execute('SELECT count(*) FROM pgz_sport.dokumenti')
        print(f'TOTAL: {cu.fetchone()[0]}')

        cu.execute('SELECT razina, count(*) FROM pgz_sport.dokumenti GROUP BY razina ORDER BY count(*) DESC')
        print('\nPo razini:')
        for r in cu.fetchall():
            # str() because a NULL group key arrives as None, and None
            # rejects a width/alignment format spec with TypeError.
            print(f' {str(r[0]):<15} {r[1]}')

        # NOTE(review): the INSERT above fills `vrsta`, while this report
        # groups by `tip` — confirm that `tip` is the intended column.
        cu.execute('SELECT tip, count(*) FROM pgz_sport.dokumenti GROUP BY tip ORDER BY count(*) DESC')
        print('\nPo tipu:')
        for r in cu.fetchall():
            print(f' {str(r[0]):<25} {r[1]}')
    finally:
        conn.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Executable
+151
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env python3
|
||||
import psycopg2
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3', user='rinet', password='R1net2026!SecureDB#v7')
|
||||
|
||||
DOCS = [
|
||||
# ═══ PGŽ ZAJEDNICA SPORTOVA godišnje ═══
|
||||
('Detaljna raspodjela sredstava JPS PGŽ 2025','Raspodjela sredstava javnih potreba u sportu PGŽ za 2025. godinu po klubovima i savezima.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2025',['raspodjela','PGŽ','klub','savez','2025'],'2025-01-01'),
|
||||
('Detaljna raspodjela sredstava JPS PGŽ 2024','Raspodjela sredstava javnih potreba u sportu PGŽ za 2024. godinu.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2024',['raspodjela','PGŽ','2024'],'2024-01-01'),
|
||||
('Detaljna raspodjela sredstava JPS PGŽ 2023','Raspodjela sredstava 2023.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2023',['raspodjela','PGŽ','2023'],'2023-01-01'),
|
||||
('Detaljna raspodjela sredstava JPS PGŽ 2022','Raspodjela sredstava 2022.','raspodjela','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/raspodjela-2022',['raspodjela','PGŽ','2022'],'2022-01-01'),
|
||||
('Financijski plan ZS PGŽ 2025','Godišnji financijski plan Zajednice sportova PGŽ za 2025.','plan','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/plan-2025',['plan','financije','ZS PGŽ','2025'],'2025-01-01'),
|
||||
('Financijski izvještaj ZS PGŽ 2024','Godišnji izvještaj Zajednice sportova PGŽ za 2024.','izvjestaj','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/izvjestaj-2024',['izvještaj','PGŽ','2024'],'2025-03-01'),
|
||||
('Financijski izvještaj ZS PGŽ 2023','Izvještaj 2023.','izvjestaj','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/izvjestaj-2023',['izvještaj','PGŽ','2023'],'2024-03-01'),
|
||||
('Statut Zajednice sportova PGŽ','Temeljni akt ZS PGŽ — članstvo, organi, financiranje, ovlasti.','statut','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/statut',['ZS PGŽ','statut'],'2022-01-01'),
|
||||
('Pravilnik o radu ZS PGŽ','Unutarnji pravilnik rada — donošenje odluka, sjednice, glasanje.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/pravilnik-rada',['ZS PGŽ','rad'],'2022-01-01'),
|
||||
('Poslovnik Skupštine ZS PGŽ','Pravila rada Skupštine — kvorum, glasanje, dnevni red.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/poslovnik',['ZS PGŽ','poslovnik','skupština'],'2022-01-01'),
|
||||
|
||||
# ═══ KRITERIJI VREDNOVANJA ═══
|
||||
('Kriteriji za vrednovanje sportske kvalitete u PGŽ','Bodovni sustav za rangiranje klubova/saveza po sportskim rezultatima.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/kriteriji',['kriteriji','vrednovanje','rang'],'2023-01-01'),
|
||||
('Kriteriji za nositelje kvalitete u sportu','Definicija statusa nositelja kvalitete — viši stupanj sufinanciranja.','pravilnik','PGZ','Zajednica sportova PGŽ',None,None,'https://www.pgz.hr/sport/nositelji',['nositelj kvalitete','sufinanciranje'],'2023-01-01'),
|
||||
|
||||
# ═══ HOO DODATNO ═══
|
||||
('Pravilnik o registraciji nacionalnih saveza HOO','Postupak učlanjenja nacionalnog saveza u HOO.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/registracija',['HOO','registracija','nacionalni savez'],'2022-01-01'),
|
||||
('Pravilnik o discipliniranju u HOO sustavu','Disciplinske mjere unutar HOO-a i nacionalnih saveza.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/disciplina',['HOO','disciplina'],'2022-01-01'),
|
||||
('Pravilnik o medijima i komunikacijama HOO','Pravila objave informacija, akreditacije za medije.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/mediji',['HOO','mediji','akreditacija'],'2021-01-01'),
|
||||
('Pravilnik o sportskoj znanosti i istraživanjima HOO','Suradnja sa sveučilištima i znanstvenim institucijama.','pravilnik','HOO','HOO',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/znanost',['HOO','znanost','istraživanje'],'2021-01-01'),
|
||||
|
||||
# ═══ MTS dodatno ═══
|
||||
('Pravilnik o licenciranju trenera u sportu','Sustav licenciranja stručnog kadra — kategorije A/B/C, kontinuirano obrazovanje.','pravilnik','RH','MTS',None,'NN 89/23 i izmjene','https://mtus.gov.hr/dokumenti/sport',['licenca','trener','MTS'],'2023-08-04'),
|
||||
('Pravilnik o sigurnosti na sportskim događanjima','Sigurnosne mjere, redari, zaštitari, video nadzor.','pravilnik','RH','MUP + MTS',None,'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19','https://mup.gov.hr/sigurnost-na-sportu',['sigurnost','redari','navijači','MUP'],'2003-07-15'),
|
||||
('Zakon o sprječavanju nereda na sportskim natjecanjima','Pravna osnova za sigurnosne mjere na utakmicama.','zakon','RH','Sabor RH',None,'NN 117/03, 71/06, 43/09, 34/11, 68/12, 48/13, 19/15, 98/19','https://www.zakon.hr/z/345/Zakon-o-spre%C4%8Davanju-nereda-na-sportskim-natjecanjima',['neredi','navijači','sigurnost'],'2003-07-15'),
|
||||
('Zakon o priznavanju i vrednovanju inozemnih obrazovnih kvalifikacija — sport','Priznavanje stranih trenerskih licenci.','zakon','RH','MZO',None,'NN 69/22','https://www.zakon.hr/z/2856/Zakon-o-priznavanju-i-vrednovanju-inozemnih-obrazovnih-kvalifikacija',['priznavanje','strana licenca'],'2022-06-15'),
|
||||
('Pravilnik o uvjetima za obavljanje djelatnosti sportskog turizma','Sportski kampovi, škole, treninzi za goste.','pravilnik','RH','MTS',None,None,'https://mtus.gov.hr/sportski-turizam',['sportski turizam','kamp'],'2023-01-01'),
|
||||
|
||||
# ═══ HNS DODATNO ═══
|
||||
('Pravilnik o nogometnim sucima HNS','Licenciranje sudaca, dobne kategorije sudaca, ocjenjivanje.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','suci','licenca'],'2024-01-01'),
|
||||
('Pravilnik o klupskoj licenciranju HNS','UEFA i HNS klupska licenca — financijski, infrastrukturni, sportski kriteriji.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','licenca','UEFA','klub'],'2024-01-01'),
|
||||
('Pravilnik o transferu i obeštećenju HNS','Transferi između HNS klubova, obeštećenje za razvoj mladih.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','transfer','obeštećenje','razvoj'],'2024-01-01'),
|
||||
('Pravilnik o trenerima HNS','Licenciranje nogometnih trenera — UEFA Pro/A/B/C kategorije.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','trener','UEFA Pro','UEFA A'],'2024-01-01'),
|
||||
('Pravilnik o ženskom nogometu HNS','Posebni propisi za žensku nogometnu hijerarhiju.','pravilnik_savez','Savez','HNS','nogomet',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','ženski nogomet'],'2023-01-01'),
|
||||
('Pravilnik o malom nogometu HNS','Futsal natjecanja i pravila u HNS sustavu.','pravilnik_savez','Savez','HNS','futsal',None,'https://hns-cff.hr/regulatorni-okvir/',['HNS','futsal','mali nogomet'],'2023-01-01'),
|
||||
|
||||
# ═══ HRS DODATNO ═══
|
||||
('Pravilnik o klupskom licenciranju HRS','Klupska licenca u rukometu — sportski, financijski, infrastrukturni kriteriji.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','licenca','klub'],'2024-01-01'),
|
||||
('Pravilnik o sucima HRS','Licenciranje rukometnih sudaca.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','suci','licenca'],'2024-01-01'),
|
||||
('Pravilnik o trenerima HRS','Licenciranje rukometnih trenera.','pravilnik_savez','Savez','HRS','rukomet',None,'https://hrs.hr/dokumenti/',['HRS','trener','licenca'],'2024-01-01'),
|
||||
|
||||
# ═══ HKS DODATNO ═══
|
||||
('Pravilnik o sucima HKS','Košarkaški suci — licenciranje, ocjenjivanje, kategorizacija.','pravilnik_savez','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS','suci','FIBA'],'2024-01-01'),
|
||||
('Pravilnik o trenerima HKS','Licenciranje košarkaških trenera — FIBA i HKS standardi.','pravilnik_savez','Savez','HKS','košarka',None,'https://www.hks-cbf.hr/dokumenti/',['HKS','trener','FIBA'],'2024-01-01'),
|
||||
|
||||
# ═══ HOS Odbojka dodatno ═══
|
||||
('Pravilnik o registraciji odbojkaša HOS','Registracija odbojkaša, dobne kategorije, transferi.','pravilnik_savez','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS','registracija'],'2024-01-01'),
|
||||
('Pravilnik o sucima HOS','Odbojkaški suci.','pravilnik_savez','Savez','HOS','odbojka',None,'https://hos-cvf.hr/dokumenti/',['HOS','suci'],'2024-01-01'),
|
||||
|
||||
# ═══ HŠS Šah dodatno ═══
|
||||
('Pravilnik o registraciji igrača HŠS','Registracija šahista, FIDE rating, transferi.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','registracija','FIDE'],'2024-01-01'),
|
||||
('Pravilnik o sucima HŠS','Licenciranje šahovskih sudaca.','pravilnik_savez','Savez','HŠS','šah',None,'https://hrvatski-sahovski-savez.hr/dokumenti-savez/',['HŠS','suci'],'2024-01-01'),
|
||||
|
||||
# ═══ EU REGULATIVE ═══
|
||||
('GDPR (Uredba EU 2016/679)','Opća uredba o zaštiti osobnih podataka — primjenjuje se na sportske klubove (članovi, sportaši).','zakon','EU','EU Komisija',None,'OJ L 119, 4.5.2016','https://gdpr-info.eu/',['GDPR','zaštita podataka','EU','privatnost'],'2016-04-27'),
|
||||
('Zakon o provedbi Opće uredbe o zaštiti podataka','HR implementacija GDPR.','zakon','RH','Sabor RH',None,'NN 42/18','https://www.zakon.hr/z/941/Zakon-o-provedbi-Op%C4%87e-uredbe-o-za%C5%A1titi-podataka',['GDPR','privatnost','HR'],'2018-04-25'),
|
||||
('European Charter on Sport for All','CoE povelja o sportu za sve.','pravilnik','EU','Council of Europe',None,None,'https://rm.coe.int/european-charter-on-sport-for-all',['CoE','sport za sve'],'1976-09-24'),
|
||||
('UNESCO International Charter of Physical Education and Sport','UNESCO međunarodna povelja.','pravilnik','EU','UNESCO',None,None,'https://en.unesco.org/charter-of-physical-education-and-sport',['UNESCO','tjelesni odgoj'],'2015-11-17'),
|
||||
('Erasmus+ Sport Programme Guide','Vodič za EU programe sufinanciranja sporta — small partnerships, cooperation partnerships.','program','EU','Europska komisija',None,None,'https://erasmus-plus.ec.europa.eu/programme-guide',['Erasmus+','EU','suradnja','grant'],'2024-01-01'),
|
||||
('Council Conclusions on EU Work Plan for Sport 2024-2027','Plan rada EU za sport.','strategija','EU','Vijeće EU',None,None,'https://www.consilium.europa.eu/sport',['EU','plan rada','2024-2027'],'2023-12-01'),
|
||||
|
||||
# ═══ INFRASTRUKTURNI / OBJEKTI ═══
|
||||
('Pravilnik o sportskim objektima','Tehnički standardi za sportske objekte — dvorane, igrališta, bazeni.','pravilnik','RH','MTS',None,None,'https://mtus.gov.hr/sport-objekti',['objekt','dvorana','bazen','tehnički'],'2023-01-01'),
|
||||
('Pravilnik o sigurnosti na bazenima','Sigurnosni standardi za bazene i plivačka natjecanja.','pravilnik','RH','MTS + MZ',None,None,'https://mtus.gov.hr/bazeni-sigurnost',['bazen','sigurnost','plivanje'],'2022-01-01'),
|
||||
|
||||
# ═══ ŠKOLSKI SPORT DODATNO ═══
|
||||
('Nacionalni kurikulum tjelesne i zdravstvene kulture','Kurikulum predmeta TZK u OŠ i SŠ.','program','RH','MZO',None,'NN 7/19','https://mzo.gov.hr/UserDocsImages/dokumenti/Obrazovanje/NacionalniKurikulum/predmetni-kurikulumi/Tjelesna%20i%20zdravstvena%20kultura.pdf',['kurikulum','TZK','škola','MZO'],'2019-01-22'),
|
||||
('Pravilnik o organizaciji i provedbi školskih sportskih natjecanja','ŠŠD natjecanja, županijska, državna razina.','pravilnik','RH','MZO',None,None,'https://mzo.gov.hr/sport/skolska-natjecanja',['ŠŠD','natjecanje','MZO','škola'],'2022-01-01'),
|
||||
('Sustav školskih sportskih društava (SŠSD)','Organizacija školskih klubova u OŠ i SŠ.','program','RH','MZO + Hrvatski školski sportski savez',None,None,'https://hssd.hr/dokumenti',['ŠŠD','HŠŠS','organizacija'],'2023-01-01'),
|
||||
|
||||
# ═══ MEDICINA SPORTA ═══
|
||||
('Pravilnik o sportskoj medicinskoj zaštiti','Liječnička zaštita sportaša — specijalisti, prevencija ozljeda.','pravilnik','RH','MZ + HASMS',None,None,'https://hasms.hr/dokumenti/',['medicina sporta','liječnik','HASMS','prevencija'],'2023-01-01'),
|
||||
('Pravilnik o sportskoj prehrani i suplementaciji','Smjernice za prehranu i dopuste u suplementaciji.','pravilnik','RH','HASMS',None,None,'https://hasms.hr/dokumenti/',['prehrana','suplementi','HASMS'],'2022-01-01'),
|
||||
|
||||
# ═══ FINANCIJE / RAČUNOVODSTVO ═══
|
||||
('Pravilnik o neprofitnom računovodstvu','Računovodstvene obveze sportskih udruga.','pravilnik','RH','Min. financija',None,'NN 1/15, 25/17, 96/18, 103/18, 81/19','https://www.zakon.hr/z/810/Pravilnik-o-neprofitnom-ra%C4%8Dunovodstvu-i-ra%C4%8Dunskom-planu',['neprofitno','računovodstvo','udruga'],'2015-01-08'),
|
||||
('Zakon o financijskom poslovanju i računovodstvu neprofitnih organizacija','Temeljni propis za neprofit.','zakon','RH','Sabor RH',None,'NN 121/14','https://www.zakon.hr/z/672/Zakon-o-financijskom-poslovanju-i-ra%C4%8Dunovodstvu-neprofitnih-organizacija',['neprofit','financije','udruga'],'2014-10-17'),
|
||||
|
||||
# ═══ JAVNA NABAVA SPORT ═══
|
||||
('Zakon o javnoj nabavi','Postupci javne nabave za sportske objekte i opremu.','zakon','RH','Sabor RH',None,'NN 120/16, 114/22','https://www.zakon.hr/z/223/Zakon-o-javnoj-nabavi',['javna nabava','postupak','sport'],'2016-12-29'),
|
||||
|
||||
# ═══ SPECIFIČNI SPORTOVI dodatno ═══
|
||||
('Pravila igre Vaterpolo (FINA Rules)','FINA pravila vaterpola.','pravilnik','EU','World Aquatics',None,None,'https://www.worldaquatics.com/rules/water-polo',['FINA','vaterpolo','pravila'],'2023-01-01'),
|
||||
('Pravila plivanja FINA','FINA pravila plivanja.','pravilnik','EU','World Aquatics',None,None,'https://www.worldaquatics.com/rules/swimming',['FINA','plivanje'],'2023-01-01'),
|
||||
('Pravila atletike (World Athletics)','WA tehnička pravila atletike.','pravilnik','EU','World Athletics',None,None,'https://worldathletics.org/about-iaaf/documents/book-of-rules',['World Athletics','atletika'],'2024-01-01'),
|
||||
('Pravila tenisa (ITF Rules of Tennis)','ITF Rules of Tennis.','pravilnik','EU','ITF','tenis',None,'https://www.itftennis.com/en/about-us/governance/rules-and-regulations/',['ITF','tenis'],'2024-01-01'),
|
||||
('Pravila stolnog tenisa (ITTF)','ITTF Handbook.','pravilnik','EU','ITTF','stolni tenis',None,'https://www.ittf.com/handbook/',['ITTF','stolni tenis'],'2024-01-01'),
|
||||
('Pravila biciklizma (UCI)','UCI Cycling Regulations.','pravilnik','EU','UCI','biciklizam',None,'https://www.uci.org/inside-uci/constitutions-regulations/regulations',['UCI','biciklizam'],'2024-01-01'),
|
||||
('Pravila judoa (IJF)','IJF Rules.','pravilnik','EU','IJF','judo',None,'https://www.ijf.org/ijf/documents',['IJF','judo'],'2023-01-01'),
|
||||
('Pravila taekwondoa (WT)','World Taekwondo Competition Rules.','pravilnik','EU','World Taekwondo','taekwondo',None,'https://www.worldtaekwondo.org/rules/',['WT','taekwondo'],'2024-01-01'),
|
||||
('Pravila karatea (WKF)','WKF Kumite/Kata pravila.','pravilnik','EU','WKF','karate',None,'https://www.wkf.net/structure/wkf-rules',['WKF','karate'],'2024-01-01'),
|
||||
('Pravila boćanja CBI/CMSB','Confederazione Boccistica Internazionale.','pravilnik','EU','CBI','boćanje',None,'https://www.cbi-bocce.com/regolamenti',['CBI','boćanje','volo','raffa'],'2023-01-01'),
|
||||
('Pravila jedrenja (RRS World Sailing 2025-2028)','Racing Rules of Sailing.','pravilnik','EU','World Sailing','jedriličarstvo',None,'https://www.sailing.org/rrs',['RRS','jedrenje','World Sailing'],'2024-12-01'),
|
||||
('Pravila veslanja (FISA/World Rowing)','FISA pravila regata.','pravilnik','EU','World Rowing','veslanje',None,'https://worldrowing.com/about/world-rowing/rules-of-racing',['World Rowing','FISA','veslanje'],'2023-01-01'),
|
||||
('Pravila šaha (FIDE Laws of Chess)','FIDE Handbook.','pravilnik','EU','FIDE','šah',None,'https://handbook.fide.com/',['FIDE','šah'],'2023-07-01'),
|
||||
|
||||
# ═══ KLUBSKE LICENCE / TURNIRI ═══
|
||||
('Pravilnik o organizaciji međunarodnih sportskih natjecanja u RH','Postupak organiziranja međunarodnih turnira u RH (HOO suglasnost, sigurnost).','pravilnik','RH','MTS + HOO',None,None,'https://mtus.gov.hr/medunarodna-natjecanja',['međunarodno','turnir','organizacija'],'2023-01-01'),
|
||||
|
||||
# ═══ POSEBNO PGŽ + GRAD RIJEKA NOVIJI ═══
|
||||
('Proračun PGŽ — sport 2026','Proračunska sredstva za sport u PGŽ proračunu 2026.','plan','PGZ','PGŽ',None,None,'https://www.pgz.hr/proracun-2026',['proračun','PGŽ','2026','sport'],'2025-12-01'),
|
||||
('Proračun Grada Rijeke — sport 2026','Proračunska sredstva za sport u Rijeci 2026.','plan','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/proracun-2026',['proračun','Rijeka','2026'],'2025-12-01'),
|
||||
('Plan razvoja Grada Rijeke 2024-2030','Gradski razvojni plan — sport kao strateška mjera.','strategija','Grad Rijeka','Grad Rijeka',None,None,'https://www.rijeka.hr/plan-razvoja',['Rijeka','razvoj','2024-2030'],'2024-01-01'),
|
||||
('Strategija "Sport za sve" PGŽ','Program rekreativnog sporta i tjelesne aktivnosti građana.','strategija','PGZ','PGŽ',None,None,'https://www.pgz.hr/sport-za-sve',['rekreacija','sport za sve','PGŽ'],'2023-01-01'),
|
||||
('Pravilnik o radu Sportskih škola PGŽ','Organizacija škola sporta za djecu — sufinanciranje.','pravilnik','PGZ','PGŽ',None,None,'https://www.pgz.hr/skole-sporta',['škola sporta','djeca','PGŽ'],'2023-01-01'),
|
||||
('Akcijski plan promidžbe sporta PGŽ','Marketing i promocija sporta u županiji.','program','PGZ','PGŽ',None,None,'https://www.pgz.hr/promidzba-sport',['promidžba','marketing','PGŽ'],'2023-01-01'),
|
||||
|
||||
# ═══ ZAŠTITA ═══
|
||||
('Zakon o zaštiti od nasilja u obitelji — sport','Sportski klubovi kao mjesto prijave nasilja u obitelji.','zakon','RH','Sabor RH',None,'NN 70/17, 126/19, 84/21','https://www.zakon.hr/z/977/Zakon-o-za%C5%A1titi-od-nasilja-u-obitelji',['nasilje','zaštita','obitelj'],'2017-07-21'),
|
||||
('Sportski etički kodeks RH','Nacionalni etički kodeks — sport bez korupcije, fair play.','pravilnik','RH','HOO + MTS',None,None,'https://www.hoo.hr/hr-hr/o-hoo-u/etika',['etika','fair play','korupcija'],'2022-01-01'),
|
||||
|
||||
# ═══ EDUKACIJA ═══
|
||||
('Erasmus+ Sport — Hrvatska iskustva','Sažetak iskusnih projekta iz HR (Erasmus Sport).','erasmus','EU','Europska komisija',None,None,'https://erasmus-plus.ec.europa.eu/sport',['Erasmus+','Hrvatska','EU'],'2024-01-01'),
|
||||
('Akademski sport u RH (HASS)','Pravilnik Hrvatskog akademskog sportskog saveza.','pravilnik','RH','HASS',None,None,'https://hass.hr/dokumenti',['akademski','HASS','sveučilište'],'2022-01-01'),
|
||||
('Pravilnik o studentskim sportskim natjecanjima','Univerzitetska natjecanja.','pravilnik','RH','HASS',None,None,'https://hass.hr/natjecanja',['studenti','natjecanje','HASS'],'2022-01-01'),
|
||||
]
|
||||
|
||||
def main():
    """Seed pgz_sport.dokumenti with every DOCS entry whose title is not stored yet.

    Titles are compared case-insensitively against the titles already in the
    table. Each insert is attempted independently; a failing row is reported
    and skipped. Prints the added/skipped counters and the resulting row count.
    """
    conn = psycopg2.connect(**DB)
    conn.autocommit = True
    try:
        cu = conn.cursor()
        cu.execute("SELECT LOWER(COALESCE(title, '')) FROM pgz_sport.dokumenti")
        existing = {row[0] for row in cu.fetchall()}

        n_added = 0
        n_skip = 0
        for d in DOCS:
            key = d[0].lower()
            if key in existing:
                n_skip += 1
                continue
            try:
                cu.execute('''INSERT INTO pgz_sport.dokumenti
                    (title, kratak_opis, vrsta, razina, organizacija, sport, sluzbeni_glasnik,
                     izvor_url, kljucne_rijeci, izdano_datum, aktivan)
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)''', d)
                n_added += 1
                # Remember the title so a duplicate later in DOCS is skipped too.
                existing.add(key)
            except Exception as e:
                print(f' err {d[0][:40]}: {e}')

        print(f'Added: {n_added}, Skipped: {n_skip}')
        cu.execute('SELECT count(*) FROM pgz_sport.dokumenti')
        print(f'TOTAL: {cu.fetchone()[0]}')
    finally:
        # Release the connection even when one of the bookkeeping queries fails.
        conn.close()


if __name__ == '__main__':
    main()
|
||||
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: klub_oib_enricher.py | v1.0.0 | 04.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scrapers/klub_oib_enricher.py
|
||||
# Svrha: Enrichment OIB-a za pgz_sport.klubovi (678 BEZ OIB)
|
||||
# Strategy: 1) match s civic.entities by naziv → kopiraj OIB
|
||||
# 2) DDG search za kluba s "OIB" i extract
|
||||
# 3) Sudreg API lookup po naziv (cache)
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""OIB enrichment za PGŽ Sport klubove."""
|
||||
import os, sys, time, re, hashlib, logging, json
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [klub_oib] %(message)s')
|
||||
log = logging.getLogger("klub_oib")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
def db():
    """Open a new PostgreSQL connection for DSN whose cursors use RealDictCursor."""
    connection = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    return connection
|
||||
|
||||
def normalize_name(n):
    """Normalize a club name for matching.

    Lower-cases the name, drops generic organisation words and club-type
    abbreviations (klub, udruga, nk, kk, ...) and returns the remaining words
    separated by single spaces. Returns "" for empty or None input.
    """
    if not n:
        return ""
    lowered = n.lower().strip()
    # remove common prefixes/suffixes
    stripped = re.sub(r'\b(klub|udruga|sportski|nk|hk|kk|bk|ok|rk|jk|šk|sk|tk)\b', '', lowered)
    # Collapse whitespace only after the removal, otherwise a word removed
    # from the middle of the name leaves a double space behind.
    return re.sub(r'\s+', ' ', stripped).strip()
|
||||
|
||||
def match_civic_entities(cur, naziv):
    """Find the civic.entities row that best matches a club name.

    Args:
        cur: open cursor whose rows support access by column name.
        naziv: club name to look up.

    Returns:
        The best candidate row (with at least 'oib' and 'naziv'), or None when
        the name is shorter than 5 characters, normalizes to nothing, or no
        candidate shares at least 60 % of its normalized words with the name.
    """
    if not naziv or len(naziv) < 5:
        return None

    naziv_norm = normalize_name(naziv)
    if not naziv_norm:
        return None

    def contains(fragment):
        # Backslash is PostgreSQL's default LIKE escape character; escaping
        # keeps '%' and '_' inside a club name from acting as wildcards.
        escaped = fragment.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        return f'%{escaped}%'

    # First pass: substring match on the first 30 characters of the raw name.
    cur.execute("""
        SELECT oib, name AS naziv, county AS county_code FROM civic.entities
        WHERE oib IS NOT NULL AND length(oib) = 11
        AND lower(name) ILIKE %s
        LIMIT 5
    """, (contains(naziv[:30].lower()),))
    candidates = cur.fetchall()

    if not candidates:
        # Fallback: probe with up to three significant words and keep the
        # candidates of the first word that yields any rows.
        for word in [w for w in naziv_norm.split() if len(w) > 3][:3]:
            cur.execute("""
                SELECT oib, name AS naziv FROM civic.entities
                WHERE oib IS NOT NULL AND length(oib) = 11
                AND lower(name) ILIKE %s
                LIMIT 3
            """, (contains(word),))
            candidates = cur.fetchall()
            if candidates:
                break

    if not candidates:
        return None

    # Score candidates by word overlap; the wanted word set is loop-invariant.
    wanted = set(naziv_norm.split())
    best = None
    best_score = 0
    for c in candidates:
        have = set(normalize_name(c['naziv']).split())
        if not wanted or not have:
            continue
        overlap = len(wanted & have) / max(len(wanted), len(have))
        if overlap > best_score and overlap >= 0.6:
            best_score = overlap
            best = c

    return best
|
||||
|
||||
def enrich():
    """Fill in missing OIBs in pgz_sport.klubovi from civic.entities name matches.

    Processes every club whose oib is NULL, empty or not 11 characters long.
    The UPDATE uses the same predicate as the SELECT, and a club is counted as
    enriched only when a row was actually changed.

    Returns:
        Number of clubs whose OIB was written.
    """
    conn = db()
    conn.autocommit = True
    enriched = 0
    no_match = 0
    try:
        cur = conn.cursor()

        # Get klubovi without OIB
        cur.execute("""
            SELECT id, naziv, savez_id, sport, grad
            FROM pgz_sport.klubovi
            WHERE (oib IS NULL OR oib = '' OR length(oib) != 11)
            ORDER BY id
        """)
        klubovi = cur.fetchall()
        log.info(f"Klubovi za enrichment: {len(klubovi)}")

        for k in klubovi:
            match = match_civic_entities(cur, k['naziv'])
            updated = 0
            if match:
                # Same "needs an OIB" condition as the SELECT above, so clubs
                # holding a malformed OIB are really overwritten.
                cur.execute("""
                    UPDATE pgz_sport.klubovi
                    SET oib = %s, updated_at = now()
                    WHERE id = %s AND (oib IS NULL OR oib = '' OR length(oib) != 11)
                """, (match['oib'], k['id']))
                updated = cur.rowcount

            if updated:
                log.info(f"✓ {k['naziv'][:40]} → OIB {match['oib']} (matched: {match['naziv'][:40]})")
                enriched += 1
            else:
                no_match += 1

            if (enriched + no_match) % 50 == 0:
                log.info(f"Progress: {enriched} enriched / {no_match} no_match / {enriched+no_match}/{len(klubovi)}")

        log.info(f"FINAL: {enriched} enriched, {no_match} no_match")
        cur.close()
    finally:
        conn.close()
    return enriched


if __name__ == "__main__":
    enrich()
|
||||
Executable
+203
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python3
|
||||
"""LLM mining v6 - multi-godina, persist to clan_nagrada, name validation."""
|
||||
import json, re, psycopg2, requests, time, os, sys
|
||||
|
||||
DB = dict(host='localhost', port=5432, dbname='rinet_v3',
|
||||
user='rinet', password='R1net2026!SecureDB#v7')
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
OLLAMA = "http://localhost:11434/api/generate"
|
||||
MODEL = "qwen2.5:7b"
|
||||
|
||||
PROMPT_TPL = """Iz teksta godišnjaka sporta ekstraktiraj sve KONKRETNE rezultate sportaša. Vrati ISKLJUČIVO valjani JSON niz.
|
||||
|
||||
Format jednog objekta:
|
||||
{"sportas":"Ime Prezime","klub":"AK Kvarner","sport":"atletika","godina":2024,"natjecanje":"PH","disciplina":"100m","plasman":1,"medalja":"zlato"}
|
||||
|
||||
KAKO ČITATI:
|
||||
- "1. mjesto" → plasman=1, medalja="zlato"
|
||||
- "2. mjesto" → plasman=2, medalja="srebro"
|
||||
- "3. mjesto" → plasman=3, medalja="bronca"
|
||||
- "4./5./6./7./8. mjesto" → plasman=N, medalja=null
|
||||
- Ako tekst eksplicitno kaže "zlato/srebro/bronca", uzmi to
|
||||
|
||||
PRAVILA:
|
||||
- Vrati [] ako nema konkretnih rezultata
|
||||
- Sport mora biti pravi naziv: atletika, plivanje, košarka, rukomet, vaterpolo, nogomet, jedrenje, biciklizam, šah, taekwondo, karate, gimnastika, tenis, judo, streljaštvo, boćanje, kuglanje, veslanje, stolni tenis, plesovi itd.
|
||||
- "klub" - ekstraktiraj iz teksta ako je naveden
|
||||
- BEZ markdowna, BEZ ```, samo JSON niz
|
||||
- Ne izmišljaj imena - ako nije sigurno, preskoči
|
||||
|
||||
TEKST:
|
||||
___TXT___"""
|
||||
|
||||
KW = re.compile(r'(zlat|srebr|bronc|prvenstv|prvak|EP\b|SP\b|olimp|medalj|svjetsk|europsk|\d\.\s*mjest)', re.IGNORECASE)
|
||||
NAMES_RE = re.compile(r'\b([A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+\s+[A-ZČĆĐŠŽ][a-zčćđšžćžšđč]+)\b')
|
||||
# Pre-filter junk - non-name patterns that LLM can pick up
|
||||
NAME_BLACKLIST = re.compile(r'^(Sport Psychology|Endurance|HNS|HOO|HRS|HKS|MSO|HSS|SSP|TVS|HTS|RNK|ZSU|DVD|Liga HSS|HRP|Vikend|Posto|Visin|Razin|Klub|Sportski|Nogomet|Adresa|Email|Web|Tel|Mob|Fax)', re.IGNORECASE)
|
||||
|
||||
def find_chunks(sadrzaj, chunk_size=2200, overlap=200):
    """Slice text into overlapping windows and keep the ones worth sending to the LLM.

    A window is kept when it matches the KW keyword pattern and NAMES_RE finds
    at least two name-like word pairs in it.

    Args:
        sadrzaj: full document text; None or "" yields no chunks.
        chunk_size: window length in characters.
        overlap: characters shared by consecutive windows.

    Returns:
        List of (offset, chunk) tuples in document order.

    Raises:
        ValueError: if overlap is not smaller than chunk_size (the window
            would never advance).
    """
    if not sadrzaj:
        return []
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    chunks = []
    for start in range(0, len(sadrzaj), step):
        chunk = sadrzaj[start:start + chunk_size]
        if KW.search(chunk) and len(NAMES_RE.findall(chunk)) >= 2:
            chunks.append((start, chunk))
    return chunks
|
||||
|
||||
def _parse_json_array(raw):
    """Decode the first JSON array found in raw model output; [] when none decodes."""
    body = re.sub(r'^```(?:json)?\s*', '', raw.strip())
    body = re.sub(r'\s*```$', '', body)
    start = re.search(r'\[\s*[\{\]]', body)
    if not start:
        return []
    payload = body[start.start():]
    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        pass

    # The model may append text after the array. Retry at every ']' within the
    # last 3000 characters, right to left, instead of sampling fixed offsets
    # that only hit a bracket by chance.
    floor = max(len(payload) - 3000, 0)
    end = payload.rfind(']')
    while end > floor:
        try:
            return json.loads(payload[:end + 1])
        except json.JSONDecodeError:
            end = payload.rfind(']', 0, end)
    return []


def call_llm(text):
    """Ask the Ollama model to extract results from one text chunk.

    Args:
        text: chunk substituted for the ___TXT___ placeholder in PROMPT_TPL.

    Returns:
        The decoded JSON array (possibly empty) on success, [] when the reply
        holds no decodable array, or None when the request fails, the HTTP
        status is not 200, or any other error occurs.
    """
    try:
        rsp = requests.post(OLLAMA, json={
            "model": MODEL,
            "prompt": PROMPT_TPL.replace("___TXT___", text),
            "stream": False,
            "options": {"temperature": 0.0, "num_predict": 3000, "num_ctx": 4096}
        }, timeout=180)
        if rsp.status_code != 200:
            return None
        # "response" may be missing or null; treat both as an empty reply.
        return _parse_json_array(rsp.json().get('response') or '')
    except Exception:
        # Boundary of the per-chunk call: the caller reports None as an error
        # and moves on to the next chunk.
        return None
|
||||
|
||||
def insert_fact(f, godina_godisnjaka, doc_id):
    """Insert one LLM-extracted result into pgz_sport.clan_nagrada with dedup.

    Args:
        f: dict produced by the model (keys sportas, klub, godina, natjecanje,
            disciplina, plasman, medalja); values may be null or mistyped.
        godina_godisnjaka: year used when the fact carries no 'godina'.
        doc_id: id of the source document, stored as "dokument:<id>".

    Returns:
        'inserted' on success, 'blacklisted' when the name matches
        NAME_BLACKLIST, 'err:<message>' when the insert fails, and None when
        the fact is rejected (name shorter than 4 characters, placement
        missing or outside 1-8).
    """
    # The model can emit null or non-string values; never call str methods on them.
    sportas = str(f.get('sportas') or '').strip()
    if len(sportas) < 4:
        return None
    if NAME_BLACKLIST.match(sportas):
        return 'blacklisted'

    plasman = f.get('plasman')
    if isinstance(plasman, str):
        # Accept "1" and the ordinal form "1." as well as real integers.
        digits = plasman.strip().rstrip('.')
        plasman = int(digits) if digits.isdecimal() else None
    if isinstance(plasman, bool) or plasman not in (1, 2, 3, 4, 5, 6, 7, 8):
        return None
    plasman = int(plasman)

    # Auto-fix medalja
    medalja = f.get('medalja')
    if not medalja:
        medalja = {1: 'zlato', 2: 'srebro', 3: 'bronca'}.get(plasman, medalja)

    god = f.get('godina') or godina_godisnjaka
    natj = str(f.get('natjecanje') or 'unknown')[:200]
    disc = str(f.get('disciplina') or 'unknown')[:200]

    # Try to match clan_id by ime+prezime (either order); needs two words.
    clan_id = None
    if len(sportas.split(None, 1)) == 2:
        cu.execute("""SELECT id FROM pgz_sport.clanovi
                      WHERE LOWER(ime||' '||prezime) = LOWER(%s)
                         OR LOWER(prezime||' '||ime) = LOWER(%s)
                      LIMIT 1""", (sportas, sportas))
        rec = cu.fetchone()
        if rec:
            clan_id = rec[0]

    # Try to match klub_id: exact name or substring, preferring active PGŽ clubs.
    klub_naziv = str(f.get('klub') or '').strip()
    klub_id = None
    if len(klub_naziv) >= 4:
        # Escape LIKE wildcards so '%' / '_' in the extracted name match literally.
        like_safe = klub_naziv.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
        cu.execute("""SELECT id FROM pgz_sport.klubovi
                      WHERE LOWER(naziv) = LOWER(%s)
                         OR LOWER(naziv) LIKE LOWER(%s)
                      ORDER BY
                        CASE WHEN aktivan THEN 0 ELSE 1 END,
                        CASE WHEN region='PGŽ' THEN 0 ELSE 1 END,
                        id ASC LIMIT 1""",
                   (klub_naziv, f"%{like_safe}%"))
        rec = cu.fetchone()
        if rec:
            klub_id = rec[0]

    try:
        # On a duplicate only the links still missing are filled in.
        cu.execute("""INSERT INTO pgz_sport.clan_nagrada
                      (clan_id, ime_prezime, klub_id, klub_naziv, godina, natjecanje,
                       disciplina, plasman, medalja, source, source_url, last_updated)
                      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'llm_godisnjak', %s, now())
                      ON CONFLICT ON CONSTRAINT clan_nagrada_uniq DO UPDATE SET
                        clan_id = COALESCE(pgz_sport.clan_nagrada.clan_id, EXCLUDED.clan_id),
                        klub_id = COALESCE(pgz_sport.clan_nagrada.klub_id, EXCLUDED.klub_id),
                        last_updated = now()""",
                   (clan_id, sportas, klub_id, klub_naziv or None, god, natj, disc, plasman, medalja,
                    f"dokument:{doc_id}"))
        return 'inserted'
    except Exception as e:
        return f'err:{e}'
|
||||
|
||||
# Years come from argv[1] as a comma-separated list (default: 2024);
# LLM_LIMIT caps how many chunks per yearbook are sent to the model.
GODINA_LIST = [int(x) for x in (sys.argv[1].split(',') if len(sys.argv) > 1 else ['2024'])]
LIMIT = int(os.environ.get('LLM_LIMIT', 50))

total_inserted = 0
total_blacklisted = 0
total_facts = 0
t_global = time.time()

for godina in GODINA_LIST:
    cu.execute("SELECT id, sadrzaj FROM pgz_sport.dokumenti WHERE vrsta='godisnjak' AND godina=%s LIMIT 1", (godina,))
    rec = cu.fetchone()
    if not rec:
        print(f"=== Nema {godina} ===")
        continue
    did, sadrzaj = rec
    # A yearbook row can exist without extracted text; treat NULL as empty.
    sadrzaj = sadrzaj or ''

    print(f"\n=== Godišnjak {godina} (id={did}, {len(sadrzaj):,} chars) ===")
    chunks = find_chunks(sadrzaj)
    batch = chunks[:LIMIT]
    print(f"Chunks: {len(chunks)}, processing {len(batch)}")

    god_inserted = 0
    god_facts = 0
    t_god = time.time()

    for idx, (off, chunk) in enumerate(batch):
        t0 = time.time()
        facts = call_llm(chunk)
        el = time.time() - t0
        if facts is None:
            # Progress is shown against the chunks actually processed, not LIMIT.
            print(f" [{idx+1}/{len(batch)}] ERR ({el:.1f}s)", flush=True)
            continue
        if not facts:
            continue

        for f in facts:
            if not isinstance(f, dict):
                continue
            res = insert_fact(f, godina, did)
            god_facts += 1
            if res == 'inserted':
                god_inserted += 1
            elif res == 'blacklisted':
                total_blacklisted += 1

        print(f" [{idx+1}/{len(batch)}] {el:.1f}s {len(facts)} parsed", flush=True)

    print(f"\n=== {godina}: {god_facts} facts, {god_inserted} inserted, {time.time()-t_god:.0f}s ===")
    total_facts += god_facts
    total_inserted += god_inserted

print(f"\n=== TOTAL ({time.time()-t_global:.0f}s): {total_facts} parsed, {total_inserted} inserted, {total_blacklisted} blacklisted ===")

# Stats
cu.execute("""SELECT godina, count(*) FROM pgz_sport.clan_nagrada
              WHERE source='llm_godisnjak' GROUP BY godina ORDER BY godina""")
print("\n=== LLM clan_nagrada by year ===")
for r in cu.fetchall():
    print(f" {r[0]}: {r[1]}")

cu.execute("""SELECT count(*) FILTER (WHERE clan_id IS NOT NULL),
                     count(*) FILTER (WHERE clan_id IS NULL),
                     count(*) FILTER (WHERE klub_id IS NOT NULL),
                     count(*)
              FROM pgz_sport.clan_nagrada WHERE source='llm_godisnjak'""")
r = cu.fetchone()
print(f"\nLinking stats: {r[0]} clan_id linked, {r[1]} unlinked, {r[2]} klub_id linked of {r[3]} total")

# Audit
cu.execute("""INSERT INTO pgz_sport.audit_feed
              (table_name, action, source, source_url, details)
              VALUES ('clan_nagrada', 'llm_mining_v6_db', 'qwen2.5:7b', NULL, %s::jsonb)""",
           (json.dumps({"godine": GODINA_LIST, "facts": total_facts, "inserted": total_inserted, "chunk_limit": LIMIT}),))

# Release the module-level connection once the run has been audited.
cu.close()
conn.close()
|
||||
Executable
+7
@@ -0,0 +1,7 @@
|
||||
#!/bin/bash
# Endless OIB-enrichment cycle: run the three enrichers one after another,
# show only the last two output lines of each, then wait 10 minutes.
SCRAPER_DIR=/opt/pgz-sport/scrapers

while :; do
  for job in klub_oib_enricher clan_oib_enricher sudreg_klub_search; do
    python3 "$SCRAPER_DIR/$job.py" 2>&1 | tail -2
  done
  sleep 600
done
|
||||
Executable
+8
@@ -0,0 +1,8 @@
|
||||
#!/bin/bash
# Endless deep-scrape cycle: announce the cycle, run the three scrapers in
# order (last two output lines each), then wait 30 minutes.
SCRAPER_DIR=/opt/pgz-sport/scrapers

while :; do
  echo "[$(date)] PGŽ deep cycle"
  for job in pgz_sport_deep rijeka_sport_scraper rss_rijeka_scraper; do
    python3 "$SCRAPER_DIR/$job.py" 2>&1 | tail -2
  done
  sleep 1800
done
|
||||
@@ -0,0 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
# pgz_sport_deep.py — Deep scrape sport-pgz.hr + pgz.hr/sport
|
||||
import os, sys, time, hashlib, logging, re
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from html import unescape
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [pgz_deep] %(message)s')
|
||||
log = logging.getLogger("pgz_deep")
|
||||
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
UA = "Mozilla/5.0 (Ri.NET Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
ROOTS = [
|
||||
"https://sport-pgz.hr",
|
||||
"https://www.pgz.hr/teme/sport/",
|
||||
"https://www.pgz.hr/sport/",
|
||||
"https://www.pgz.hr/o-zupaniji/upravna-tijela/upravni-odjel-za-kulturu-sport-tehnicku-kulturu/",
|
||||
]
|
||||
|
||||
def fetch(url, retries=3):
    """Download a URL as text, retrying transient failures with a growing delay.

    Args:
        url: address to fetch.
        retries: maximum number of attempts.

    Returns:
        (body, status) on success, where body is decoded as UTF-8 with
        undecodable bytes replaced; (None, 0) when every attempt failed.
    """
    from urllib.error import HTTPError

    for attempt in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                return r.read().decode('utf-8', errors='replace'), r.status
        except HTTPError as e:
            # Client errors other than timeout / rate limit are permanent:
            # retrying a 404 only delays the crawl.
            if 400 <= e.code < 500 and e.code not in (408, 429):
                log.debug(f"fetch {url[:80]}: HTTP {e.code}, not retrying")
                break
            log.debug(f"fetch {url[:80]}: HTTP {e.code} (attempt {attempt + 1}/{retries})")
        except Exception as e:
            log.debug(f"fetch {url[:80]}: {e} (attempt {attempt + 1}/{retries})")
        # Back off 3 s, 6 s, ... but never after the final attempt.
        if attempt < retries - 1:
            time.sleep(3 * (attempt + 1))
    return None, 0
|
||||
|
||||
def extract_text(html):
    """Return the visible text of an HTML document as one whitespace-normalized line.

    Script and style elements are dropped with their content, remaining tags
    become spaces, HTML entities are decoded and whitespace runs collapse to a
    single space. Empty or None input gives "".
    """
    if not html:
        return ""
    without_blocks = html
    for element in ('script', 'style'):
        block_re = r'<' + element + r'[^>]*>.*?</' + element + r'>'
        without_blocks = re.sub(block_re, '', without_blocks, flags=re.S|re.I)
    plain = unescape(re.sub(r'<[^>]+>', ' ', without_blocks))
    collapsed = re.sub(r'\s+', ' ', plain)
    return collapsed.strip()
|
||||
|
||||
def find_links(html, base):
    """Collect in-scope links from a page.

    A link is in scope when its host equals, or is a subdomain of, pgz.hr,
    sport-pgz.hr or the host of the page itself (leading "www." ignored).
    Including the page's own host lets a crawl started from other roots keep
    following links on those sites. Hosts are compared by domain suffix, not
    substring, so look-alike hosts such as "evilpgz.hr.example.com" are
    rejected.

    Args:
        html: page markup; None or "" yields no links.
        base: URL of the page, used to resolve relative hrefs.

    Returns:
        Absolute URLs without fragments, de-duplicated, in document order.
    """
    if not html:
        return []

    allowed = {'pgz.hr', 'sport-pgz.hr'}
    try:
        base_host = urlparse(base).hostname or ""
    except ValueError:
        base_host = ""
    if base_host.startswith("www."):
        base_host = base_host[4:]
    if base_host:
        allowed.add(base_host)

    found = {}  # dict keys keep first-seen order, giving a stable result
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        try:
            # Fragments address the same document; drop them to avoid re-fetching.
            u = urljoin(base, m.group(1)).split('#', 1)[0]
            host = urlparse(u).hostname or ""
        except ValueError:
            continue  # malformed href
        if any(host == d or host.endswith('.' + d) for d in allowed):
            found[u] = None
    return list(found)
|
||||
|
||||
def find_pdf_links(html, base):
    """Return the distinct absolute URLs of all hrefs in html that end in ".pdf".

    Matching is case-insensitive; relative hrefs are resolved against base.
    The order of the returned list is unspecified. None or "" yields [].
    """
    if not html:
        return []
    href_re = re.compile(r'href=["\']([^"\']+\.pdf)["\']', re.I)
    unique = {urljoin(base, target) for target in href_re.findall(html)}
    return list(unique)
|
||||
|
||||
def harvest():
    """Crawl breadth-first from ROOTS (at most 300 URLs) and store what is found.

    For every page that loads with status 200 and has at least 100 characters
    of text:
      * the page text goes to pgz_sport.dokumenti (vrsta 'web');
      * every linked PDF is registered in pgz_sport.dokumenti (vrsta 'pdf');
      * if the text mentions a sport-related keyword, up to five 800-character
        chunks are stored in dabi.knowledge;
      * up to 25 of its in-scope links are queued.
    Insert failures are logged and never stop the crawl.
    """
    keywords = ['sport', 'klub', 'savez', 'sportaš', 'sportaši', 'natjecanj', 'manifestacij', 'javne potrebe', 'sufinancir', 'kup', 'prvenstvo', 'liga', 'utakm', 'igrač', 'trener', 'olimpij', 'paraolimpij', 'turn', 'medalj', 'pobjed', 'gradonaceln', 'župan', 'rijeka', 'pgž', 'primorsko', 'subvenc', 'natječaj', 'odluka', 'proračun', 'rebal']

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()

    visited = set()
    queue = list(ROOTS)
    docs = 0
    facts = 0
    pdfs_logged = 0

    try:
        while queue and len(visited) < 300:
            url = queue.pop(0)
            if url in visited:
                continue
            visited.add(url)

            html, status = fetch(url)
            if not html or status != 200:
                time.sleep(1)
                continue

            log.info(f"[{status}] {url[:80]} ({len(html)} bytes)")
            # NUL characters cannot be stored in PostgreSQL text columns.
            text = extract_text(html).replace('\x00', '')
            if len(text) < 100:
                continue

            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
            title = title_m.group(1).strip() if title_m else url[:80]
            sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()

            try:
                cur.execute("""
                    INSERT INTO pgz_sport.dokumenti
                    (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                    ON CONFLICT DO NOTHING
                """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'PGŽ'))
                docs += cur.rowcount
            except Exception as e:
                log.warning(f"Doc insert fail: {e}")

            # PDF links — log them
            for pdf_url in find_pdf_links(html, url):
                try:
                    pdf_sha = hashlib.sha1(pdf_url.encode()).hexdigest()
                    cur.execute("""
                        INSERT INTO pgz_sport.dokumenti
                        (url, pdf_url, fname, title, vrsta, izvor_url, scraped_at, sha1, organizacija)
                        VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                        ON CONFLICT DO NOTHING
                    """, (pdf_url, pdf_url, pdf_url.split('/')[-1][:100],
                          pdf_url.split('/')[-1][:200], 'pdf', url, pdf_sha, 'PGŽ'))
                    pdfs_logged += cur.rowcount
                except Exception as e:
                    log.warning(f"PDF insert fail: {e}")

            # Knowledge facts — sport relevant
            lowered = text.lower()
            if any(kw in lowered for kw in keywords):
                # Save chunk as fact
                chunks = [text[i:i+800] for i in range(0, min(len(text), 5000), 800)]
                for ci, chunk in enumerate(chunks[:5]):
                    if len(chunk) < 200:
                        continue
                    fact_hash = hashlib.sha256((url + str(ci) + chunk[:100]).encode()).hexdigest()
                    try:
                        cur.execute("""
                            INSERT INTO dabi.knowledge
                            (fact, category, source, source_url, source_date, confidence, data_hash)
                            VALUES (%s, 'pgz_sport_official', 'pgz_sport_deep', %s, CURRENT_DATE, 0.85, %s)
                            ON CONFLICT (data_hash) DO NOTHING
                        """, (chunk[:1500], url, fact_hash))
                        facts += cur.rowcount
                    except Exception as e:
                        log.warning(f"Fact insert fail: {e}")

            # Follow links
            for l in find_links(html, url)[:25]:
                if l not in visited and l not in queue:
                    queue.append(l)

        log.info(f"FINAL: visited={len(visited)} docs={docs} pdfs={pdfs_logged} facts={facts}")
    finally:
        cur.close()
        conn.close()


if __name__ == "__main__":
    harvest()
|
||||
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
# qa_from_sport_facts.py — Generate Q&A pairs from PGZ sport facts za DABI trening
|
||||
import psycopg2, hashlib, logging, json, re
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [qa_gen] %(message)s')
|
||||
log = logging.getLogger("qa_gen")
|
||||
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
def main():
    """Generate question/answer pairs from PGŽ sport tables and store them in dabi.training_qa.

    Pairs are built from clubs (active, max 1000), federations, people holding
    a president/secretary role in three or more clubs, events and competitions
    (max 500). Every pair is inserted with source_type 'pgz_sport_auto';
    conflicting rows are skipped by the database. Only the first three insert
    errors are logged in detail, followed by the total number of failures.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    qa_rows = []  # (question, answer, category)

    def add(category, question, answer):
        qa_rows.append((question, answer, category))

    try:
        # Clubs → Q&A
        cur.execute("""
            SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, godina_osnutka, broj_clanova
            FROM pgz_sport.klubovi WHERE aktivan = true AND naziv IS NOT NULL
            LIMIT 1000
        """)
        for _kid, naziv, oib, sport, grad, preds, tajn, god, n_cl in cur.fetchall():
            cat = 'pgz_sport_klub_qa'
            if oib: add(cat, f"Koji je OIB kluba {naziv}?", f"OIB kluba {naziv} je {oib}.")
            if sport: add(cat, f"Kojim sportom se bavi klub {naziv}?", f"Klub {naziv} bavi se sportom: {sport}.")
            if grad: add(cat, f"U kojem gradu je klub {naziv}?", f"Klub {naziv} djeluje u gradu {grad}.")
            if preds: add(cat, f"Tko je predsjednik kluba {naziv}?", f"Predsjednik kluba {naziv} je {preds}.")
            if tajn: add(cat, f"Tko je tajnik kluba {naziv}?", f"Tajnik kluba {naziv} je {tajn}.")
            if god: add(cat, f"Kada je osnovan klub {naziv}?", f"Klub {naziv} osnovan je {god}. godine.")
            if n_cl: add(cat, f"Koliko članova ima klub {naziv}?", f"Klub {naziv} ima {n_cl} članova.")

        # Federations
        cur.execute("""
            SELECT id, naziv, oib, sport, predsjednik, tajnik FROM pgz_sport.savezi
            WHERE naziv IS NOT NULL
        """)
        for _sid, naziv, oib, _sport, preds, _tajn in cur.fetchall():
            cat = 'pgz_sport_savez_qa'
            if oib: add(cat, f"Koji je OIB saveza {naziv}?", f"OIB saveza {naziv} je {oib}.")
            if preds: add(cat, f"Tko je predsjednik {naziv}?", f"Predsjednik {naziv} je {preds}.")

        # Multi-chair questions: people with a role in three or more clubs
        cur.execute("""
            WITH all_links AS (
                SELECT lower(trim(predsjednik)) AS pk, predsjednik AS pname,
                       'klub:'||k.id AS oid, k.naziv AS oname, 'predsjednik' AS role
                FROM pgz_sport.klubovi k WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
                UNION ALL
                SELECT lower(trim(tajnik)), tajnik, 'klub:'||k.id, k.naziv, 'tajnik'
                FROM pgz_sport.klubovi k WHERE tajnik IS NOT NULL AND length(trim(tajnik)) > 5
            )
            SELECT pname, count(DISTINCT oid) AS n,
                   string_agg(DISTINCT oname, ', ') AS klubovi
            FROM all_links GROUP BY pname HAVING count(DISTINCT oid) >= 3
            ORDER BY 2 DESC LIMIT 50
        """)
        for pname, n, klubovi in cur.fetchall():
            add('pgz_sport_multichair_qa',
                f"U koliko klubova/saveza je {pname} u funkciji?",
                f"{pname} sjedi na {n} stolica u PGŽ Sport. Klubovi: {klubovi[:300]}. Ovo je multi-chair pozicija — moguć sukob interesa.")

        # Events
        cur.execute("SELECT id, naziv, mjesto, godina_od, organizator FROM pgz_sport.manifestacije WHERE naziv IS NOT NULL")
        for _mid, naziv, mjesto, god, org in cur.fetchall():
            cat = 'pgz_sport_manifestacija_qa'
            if mjesto: add(cat, f"Gdje se održava manifestacija {naziv}?", f"Manifestacija {naziv} održava se u {mjesto}.")
            if org: add(cat, f"Tko organizira manifestaciju {naziv}?", f"Manifestaciju {naziv} organizira {org}.")
            if god: add(cat, f"Otkad se održava manifestacija {naziv}?", f"Manifestacija {naziv} održava se od {god}. godine.")

        # Competitions
        cur.execute("SELECT id, naziv, sport, sezona, razina, tip FROM pgz_sport.natjecanja WHERE naziv IS NOT NULL LIMIT 500")
        for _nid, naziv, sport, sez, raz, _tip in cur.fetchall():
            cat = 'pgz_sport_natjecanje_qa'
            if sport: add(cat, f"Kojim sportom se bavi natjecanje {naziv}?", f"Natjecanje {naziv} je u sportu {sport}.")
            if raz: add(cat, f"Koja je razina natjecanja {naziv}?", f"Natjecanje {naziv} je razine {raz}.")
            if sez: add(cat, f"U kojoj sezoni je natjecanje {naziv}?", f"Natjecanje {naziv} je sezona {sez}.")

        # Save to dabi.training_qa
        inserted = 0
        failed = 0
        for question, answer, category in qa_rows:
            try:
                cur.execute("""
                    INSERT INTO dabi.training_qa
                    (question, answer, category, source_type, created_at)
                    VALUES (%s, %s, %s, %s, now())
                    ON CONFLICT DO NOTHING
                """, (question, answer, category, 'pgz_sport_auto'))
                inserted += cur.rowcount
            except Exception as e:
                failed += 1
                # Throttle on the failure count so a broken table neither
                # floods the log nor hides every error once rows succeed.
                if failed <= 3:
                    log.warning(f"insert fail: {e}")

        log.info(f"Q&A pairs generated: {len(qa_rows)}, inserted: {inserted}")
        if failed:
            log.warning(f"insert failures: {failed}")
    finally:
        cur.close(); conn.close()


if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
# rijeka_sport_scraper.py — sport.rijeka.hr + rijeka.hr/sport
|
||||
import sys, os
|
||||
sys.path.insert(0, '/opt/pgz-sport/scrapers')
|
||||
from pgz_sport_deep import harvest as base_harvest, fetch, extract_text, find_links, find_pdf_links
|
||||
import logging
|
||||
# Drop whatever handlers are already attached to the root logger (presumably
# installed when pgz_sport_deep was imported above — TODO confirm) so that the
# format configured here is the one that takes effect.
_root_logger = logging.getLogger()
_root_logger.handlers.clear()
logging.basicConfig(level=logging.INFO, format='%(asctime)s [rijeka_sport] %(message)s')

# Re-use the generic deep scraper, but start it from the Rijeka sport pages.
import pgz_sport_deep

RIJEKA_ROOTS = [
    "https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/",
    "https://www.rijeka.hr/sport/",
    "https://sport.rijeka.hr",
    "https://www.rijekasport.hr",
]
pgz_sport_deep.ROOTS = RIJEKA_ROOTS

if __name__ == "__main__":
    pgz_sport_deep.harvest()
|
||||
@@ -0,0 +1,145 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# Fajl: rss_rijeka_scraper.py | v1.0.0 | 04.05.2026
|
||||
# Autor: Damir Radulić <dradulic@outlook.com>
|
||||
# Lokacija: /opt/pgz-sport/scrapers/rss_rijeka_scraper.py
|
||||
# Svrha: RSS / Zajednica sportskih udruga grada Rijeke deep scraper
|
||||
# Cilj: financijski izvještaji, klubovi, sportaši, dokumenti
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""RSS Rijeka scraper — klubovi, financiranje, dokumenti."""
|
||||
import os, sys, time, hashlib, logging, re, json
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from html import unescape
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [rss] %(message)s')
|
||||
log = logging.getLogger("rss")
|
||||
|
||||
# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
UA = "Mozilla/5.0 (Ri.NET Civic Intelligence Bot 1.0; contact: dradulic@outlook.com)"
|
||||
|
||||
# Probe potential domains
|
||||
RSS_DOMAINS = [
|
||||
"https://rijeckisportskisavez.hr",
|
||||
"https://www.zsus-rijeka.hr",
|
||||
"https://sport.rijeka.hr",
|
||||
"https://rss-rijeka.hr",
|
||||
"https://www.rijeka.hr/teme-za-gradane/sport-i-rekreacija/"
|
||||
]
|
||||
|
||||
def fetch(url, retries=3):
    """Download *url* and return ``(text, status)``.

    The response body is decoded as UTF-8 with undecodable bytes replaced.
    Any exception counts as a failed attempt: it is logged and the request is
    retried, up to *retries* attempts in total, with a back-off of 3 s, 6 s, ...
    between attempts.

    Returns ``(None, 0)`` when every attempt fails (or when *retries* <= 0).
    """
    for attempt in range(1, retries + 1):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=20) as r:
                content = r.read().decode('utf-8', errors='replace')
                status = r.status
        except Exception as e:
            log.warning(f"Fetch fail {attempt}: {url} {e}")
            # Back off only when another attempt actually follows.
            if attempt < retries:
                time.sleep(3 * attempt)
            continue
        # Politeness delay, taken after the connection has been released
        # instead of while it is still held open.
        time.sleep(2.0)
        return content, status
    return None, 0
|
||||
|
||||
def find_links(html, base_url):
    """Extract links to the crawled sites from *html*.

    Every ``href`` value is resolved against *base_url*. Fragment-only,
    ``mailto:`` and ``javascript:`` references are ignored, as are links whose
    scheme is not http(s) or whose host contains none of the allowed domain
    tokens. Malformed URLs are skipped.

    Returns a de-duplicated list in document order, so that a caller taking
    the first N links gets the same N on every run.
    """
    if not html:
        return []
    allowed_tokens = ('rijeckisportskisavez', 'zsus-rijeka', 'rijeka.hr', 'rss-rijeka')
    ignored_prefixes = ('#', 'mailto:', 'javascript:')
    found = {}  # dict used as an insertion-ordered set
    for m in re.finditer(r'href=["\']([^"\']+)["\']', html, re.I):
        href = m.group(1)
        if href.startswith(ignored_prefixes):
            continue
        try:
            full = urljoin(base_url, href)
            parsed = urlparse(full)
            host = parsed.hostname or ""
        except ValueError:
            # urljoin/urlparse reject some malformed inputs (e.g. bad IPv6 literals)
            continue
        if parsed.scheme not in ('http', 'https'):
            continue
        if any(token in host for token in allowed_tokens):
            found[full] = None
    return list(found)
|
||||
|
||||
def extract_text(html):
    """Reduce an HTML document to whitespace-normalised plain text.

    ``<script>`` and ``<style>`` elements are removed together with their
    content, HTML comments are removed, remaining tags are replaced by a
    space, entities are unescaped and runs of whitespace collapsed.
    Returns ``""`` for empty or ``None`` input.
    """
    if not html:
        return ""
    text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S | re.I)
    text = re.sub(r'<style[^>]*>.*?</style>', '', text, flags=re.S | re.I)
    # Comments must go before the generic tag pattern: commented-out markup
    # contains '>' characters, so '<[^>]+>' would stop early and leak the
    # hidden content (and a stray '-->') into the text.
    text = re.sub(r'<!--.*?-->', ' ', text, flags=re.S)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = unescape(text)
    return re.sub(r'\s+', ' ', text).strip()
|
||||
|
||||
def extract_oibs(text, validate=False):
    """Find OIB-shaped numbers (exactly 11 consecutive digits) in *text*.

    With the default ``validate=False`` every 11-digit run is returned, in
    order of appearance and including duplicates. With ``validate=True`` only
    numbers whose last digit matches the ISO 7064 MOD 11,10 control digit are
    kept, which filters out most unrelated 11-digit numbers.
    """
    candidates = re.findall(r'\b(\d{11})\b', text)
    if not validate:
        return candidates
    return [c for c in candidates if _oib_control_digit_ok(c)]


def _oib_control_digit_ok(oib):
    """Return True when the 11th digit of *oib* is its MOD 11,10 control digit."""
    acc = 10
    for ch in oib[:10]:
        acc = (acc + int(ch)) % 10
        if acc == 0:
            acc = 10
        acc = (acc * 2) % 11
    control = 11 - acc
    if control == 10:
        control = 0
    return control == int(oib[10])
|
||||
|
||||
def extract_money(text):
    """Find monetary amounts followed by EUR, €, kn or HRK.

    Only amounts written with at least one thousands group are matched
    (``1.250,00``, ``12.000``); the numeric part is returned as written.
    Textual currency codes must end at a word boundary, so ``3.000 knjiga``
    is not mistaken for an amount in kuna.
    """
    return re.findall(
        r'(\d{1,3}(?:[.,]\d{3})+(?:[.,]\d{2})?)\s*(?:(?:EUR|kn|HRK)\b|€)',
        text,
    )
|
||||
|
||||
def harvest():
    """Crawl the RSS Rijeka sites breadth-first and store what is found.

    Starting from ``RSS_DOMAINS``, at most 200 URLs are visited. Every page
    that yields at least 100 characters of text is inserted into
    ``pgz_sport.dokumenti``; every 11-digit number in that text (except those
    starting with ``0000``) is recorded as a fact in ``dabi.knowledge``.
    Up to 30 links per page are queued for later visits.

    The database connection is closed even if the crawl aborts with an error.
    """
    from collections import deque

    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    cur = conn.cursor()

    visited = set()
    queue = deque(RSS_DOMAINS)
    # Every URL that was ever queued; equivalent to the former
    # "not in visited and not in queue" test but without the linear scan.
    enqueued = set(RSS_DOMAINS)
    docs_inserted = 0
    facts_inserted = 0

    try:
        while queue and len(visited) < 200:
            url = queue.popleft()
            if url in visited:
                continue
            visited.add(url)

            html, status = fetch(url)
            if not html or status != 200:
                continue

            log.info(f"[{status}] {url} ({len(html)} bytes)")

            text = extract_text(html)
            if len(text) < 100:
                continue

            # Store the page as a document
            title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
            title = title_m.group(1).strip() if title_m else url[:80]

            sha1 = hashlib.sha1(text.encode()).hexdigest()
            try:
                cur.execute("""
                    INSERT INTO pgz_sport.dokumenti
                        (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                    VALUES (%s, %s, %s, %s, %s, %s, now(), %s, %s)
                    ON CONFLICT DO NOTHING
                """, (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, 'RSS Rijeka'))
                docs_inserted += cur.rowcount
            except Exception as e:
                log.warning(f"Insert fail: {e}")

            # One fact per distinct OIB-shaped number on the page
            for oib in set(extract_oibs(text)):
                if oib.startswith('0000'):
                    continue
                fact = f"OIB {oib} pojavljuje se na RSS Rijeka stranici: {title[:100]}"
                fact_hash = hashlib.sha256((url + fact).encode()).hexdigest()
                try:
                    cur.execute("""
                        INSERT INTO dabi.knowledge
                            (fact, category, source, source_url, source_date, confidence, data_hash)
                        VALUES (%s, 'rss_rijeka', 'rss_rijeka_scraper', %s, CURRENT_DATE, 0.75, %s)
                        ON CONFLICT (data_hash) DO NOTHING
                    """, (fact[:500], url, fact_hash))
                    facts_inserted += cur.rowcount
                except Exception as e:
                    # Keep crawling, but do not hide the failure.
                    log.warning(f"Fact insert fail: {e}")

            # Queue more links to follow
            for link in find_links(html, url)[:30]:
                if link not in enqueued:
                    enqueued.add(link)
                    queue.append(link)

        log.info(f"FINAL: visited={len(visited)} docs={docs_inserted} facts={facts_inserted}")
    finally:
        cur.close()
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
harvest()
|
||||
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
# Federation deep scrape — HNS, HPS, HRS
|
||||
import sys
|
||||
sys.path.insert(0, '/opt/pgz-sport/scrapers')
|
||||
from gov_hr_sport_scraper import fetch, extract_text, find_links
|
||||
from urllib.parse import urljoin, urlparse
|
||||
import time, re, hashlib, json, psycopg2
|
||||
from html import unescape
|
||||
import logging
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [fed] %(message)s')
|
||||
log = logging.getLogger("fed")
|
||||
import os  # used only for the environment lookup below

# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
|
||||
ROOTS = {
|
||||
"HNS": ["https://hns-cff.hr", "https://www.nspgz.hr"],
|
||||
"HPS": ["https://www.hps.hr"],
|
||||
"HRS": ["https://www.hrs.hr"],
|
||||
"HOK": ["https://www.hok.hr"], # Hrvatski olimpijski komitet
|
||||
"HKS": ["https://www.hks.hr"], # Hrvatski karatraski savez
|
||||
}
|
||||
|
||||
def harvest():
    """Crawl each federation site in ``ROOTS`` and store pages and text facts.

    Per federation at most 80 URLs are visited, staying on the host of the
    page being processed. Pages with at least 200 characters of text are
    inserted into ``pgz_sport.dokumenti``. When the text mentions one of the
    sport keywords, up to three 800-character chunks (each at least 200
    characters long) are inserted into ``dabi.knowledge``.

    Insert failures are logged and skipped; the database connection is closed
    even if the crawl aborts with an error.
    """
    from collections import deque

    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    total_docs = total_facts = 0
    keywords = ('klub', 'sportaš', 'natjecanj', 'liga', 'kup', 'prvenstvo')
    href_re = re.compile(r'href=["\']([^"\']+)["\']', re.I)

    try:
        for fed, roots in ROOTS.items():
            log.info(f"=== {fed} deep ===")
            visited = set()
            queue = deque(roots)
            # Every URL ever queued for this federation; replaces the linear
            # "not in visited and not in queue" test.
            enqueued = set(roots)
            while queue and len(visited) < 80:
                url = queue.popleft()
                if url in visited:
                    continue
                visited.add(url)
                time.sleep(2)
                html, status = fetch(url)
                if not html or status != 200:
                    continue
                log.info(f" [{status}] {url[:80]}")
                text = extract_text(html)
                if len(text) < 200:
                    continue

                title_m = re.search(r'<title[^>]*>([^<]+)</title>', html, re.I)
                title = title_m.group(1).strip() if title_m else url[:80]
                sha1 = hashlib.sha1(text[:5000].encode()).hexdigest()
                try:
                    cur.execute("""INSERT INTO pgz_sport.dokumenti
                        (url, fname, title, sadrzaj, vrsta, izvor_url, scraped_at, sha1, organizacija)
                        VALUES (%s,%s,%s,%s,%s,%s,now(),%s,%s) ON CONFLICT DO NOTHING""",
                        (url, url.split('/')[-1][:100], title[:200], text[:50000], 'web', url, sha1, fed))
                    total_docs += cur.rowcount
                except Exception as e:
                    log.warning(f"Document insert fail: {url[:80]} {e}")

                # Facts only when the page looks sport-relevant
                lowered = text.lower()
                if any(kw in lowered for kw in keywords):
                    chunks = [text[i:i+800] for i in range(0, min(len(text), 3000), 800)]
                    for ci, chunk in enumerate(chunks[:3]):
                        if len(chunk) < 200:
                            continue
                        fh = hashlib.sha256((url+str(ci)+chunk[:80]).encode()).hexdigest()[:32]
                        try:
                            cur.execute("""INSERT INTO dabi.knowledge
                                (fact, category, source, source_refs, confidence, data_hash, created_at)
                                VALUES (%s,%s,'fed_deep_scraper',%s::jsonb,0.80,%s,now())
                                ON CONFLICT (data_hash) DO NOTHING""",
                                (chunk[:1500], f'fed_{fed.lower()}', json.dumps([{"url":url}]), fh))
                            total_facts += cur.rowcount
                        except Exception as e:
                            log.warning(f"Fact insert fail: {url[:80]} {e}")

                # Follow links that stay on the same host
                host = urlparse(url).hostname
                for m in href_re.finditer(html):
                    try:
                        u = urljoin(url, m.group(1))
                        same_host = urlparse(u).hostname == host
                    except ValueError:
                        # a malformed href must not abort the whole crawl
                        continue
                    if same_host and u not in enqueued:
                        enqueued.add(u)
                        queue.append(u)
                        if len(queue) > 100:
                            break

        log.info(f"TOTAL: docs={total_docs} facts={total_facts}")
    finally:
        cur.close(); conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
harvest()
|
||||
Executable
+11
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Master loop — PGŽ sport intensive learning every 15min.
# Runs each enrichment step in order, shows only the last lines of its
# output, then sleeps before the next round.

SCRAPER_DIR=/opt/pgz-sport/scrapers
SLEEP_SECONDS=900

# "script:lines" — script file name and how many trailing output lines to show.
STEPS=(
  "klub_oib_enricher.py:3"
  "clan_oib_enricher.py:3"
  "sport_to_knowledge.py:5"
  "qa_from_sport_facts.py:3"
)

while true; do
  echo "[$(date)] === LOOP START ==="
  for step in "${STEPS[@]}"; do
    script="${step%%:*}"
    lines="${step##*:}"
    python3 "${SCRAPER_DIR}/${script}" 2>&1 | tail -n "${lines}"
    # tail always succeeds, so the pipeline status hides a crashed script;
    # read the interpreter's own exit status instead.
    status="${PIPESTATUS[0]}"
    if [ "${status}" -ne 0 ]; then
      echo "[$(date)] WARN: ${script} exited with status ${status}"
    fi
  done
  echo "[$(date)] === LOOP END, sleep ${SLEEP_SECONDS}s ==="
  sleep "${SLEEP_SECONDS}"
done
|
||||
@@ -0,0 +1,163 @@
|
||||
#!/usr/bin/env python3
|
||||
# sport_to_knowledge.py v2.0 — match dabi.knowledge schema (source_refs jsonb)
|
||||
import os, sys, hashlib, logging, json
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sport2k] %(message)s')
|
||||
log = logging.getLogger("sport2k")
|
||||
# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
|
||||
def insert_batch(cur, facts):
    """Insert fact rows into ``dabi.knowledge`` and return how many were new.

    Parameters:
        cur: an open psycopg2 cursor.
        facts: sequence of 6-tuples ``(fact, category, source,
            source_refs_json, confidence, data_hash)``.

    Returns:
        The number of rows actually inserted. Rows whose ``data_hash`` already
        exists are skipped by ``ON CONFLICT`` and are not counted, so callers
        that log this value report real inserts rather than attempts.
    """
    if not facts:
        return 0
    # Local import: execute_values can hand back RETURNING rows, which is what
    # makes an exact insert count possible while still batching.
    from psycopg2.extras import execute_values
    inserted = execute_values(
        cur,
        """
        INSERT INTO dabi.knowledge
            (fact, category, source, source_refs, confidence, data_hash, created_at)
        VALUES %s
        ON CONFLICT (data_hash) DO NOTHING
        RETURNING 1
        """,
        facts,
        template="(%s, %s, %s, %s::jsonb, %s, %s, now())",
        page_size=200,
        fetch=True,
    )
    return len(inserted)
|
||||
|
||||
def main():
    """Turn the pgz_sport tables into text facts in ``dabi.knowledge``.

    Five groups are processed in order — clubs, federations, per-federation
    overview, people holding several posts, events and competitions. Each
    group is queried, converted to fact rows and handed to ``insert_batch``;
    the count it returns is logged per group and in total. The database
    connection is closed even if one of the steps raises.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()
    total = 0
    steps = (
        ("Klubovi facts", _klub_rows),
        ("Savezi facts", _savez_rows),
        ("Overview facts", _overview_rows),
        ("Multi-chair facts", _multichair_rows),
        ("Manifest+nat", _event_rows),
    )
    try:
        for label, build_rows in steps:
            n = insert_batch(cur, build_rows(cur))
            total += n
            log.info(f"{label}: {n}")
        log.info(f"═══ TOTAL: {total} ═══")
    finally:
        cur.close(); conn.close()


def _fact_hash(key):
    """Return the 32-hex-character SHA-256 prefix used as ``data_hash``."""
    return hashlib.sha256(key.encode()).hexdigest()[:32]


def _klub_rows(cur):
    """Build one descriptive fact row per active club."""
    cur.execute("""
        SELECT id, naziv, oib, sport, grad, predsjednik, tajnik, trener_glavni,
               broj_clanova, broj_aktivnih_sportasa, godina_osnutka, web, telefon, email, adresa
        FROM pgz_sport.klubovi WHERE aktivan = true
    """)
    rows = []
    for (kid, naziv, oib, sport, grad, preds, tajn, tren,
         n_cl, n_akt, god, web, tel, email, adr) in cur.fetchall():
        if not naziv:
            continue
        # NOTE: the fact text feeds the hash below, so its wording must stay
        # stable — changing it would re-insert every club as a new fact.
        fields = (
            ("sport: {}", sport), ("grad: {}", grad), ("OIB: {}", oib),
            ("osnovan {}.", god), ("predsjednik: {}", preds), ("tajnik: {}", tajn),
            ("glavni trener: {}", tren), ("broj članova: {}", n_cl),
            ("broj aktivnih sportaša: {}", n_akt), ("adresa: {}", adr),
            ("tel: {}", tel), ("email: {}", email), ("web: {}", web),
        )
        parts = [f"Klub {naziv}"] + [tpl.format(val) for tpl, val in fields if val]
        fact = ". ".join(parts) + "."
        if len(fact) < 30:
            continue
        refs = json.dumps([{"type":"pgz_sport_klub","id": kid, "url": f"https://sport.rinet.one/admin#klub/{kid}"}])
        rows.append((fact[:2000], 'pgz_sport_klub', 'pgz_sport_db_extract', refs, 0.92,
                     _fact_hash(f"klub:{kid}:{fact[:200]}")))
    return rows


def _savez_rows(cur):
    """Build one descriptive fact row per federation."""
    cur.execute("""
        SELECT id, naziv, oib, sport, predsjednik, tajnik, web, NULL AS broj_klubova, NULL AS broj_clanova
        FROM pgz_sport.savezi
    """)
    rows = []
    for sid, naziv, oib, sport, preds, tajn, web, n_kl, n_cl in cur.fetchall():
        if not naziv:
            continue
        # Wording is part of the hash input — keep it stable (see _klub_rows).
        fields = (
            ("sport: {}", sport), ("OIB: {}", oib), ("predsjednik: {}", preds),
            ("tajnik: {}", tajn), ("broj klubova: {}", n_kl),
            ("broj članova: {}", n_cl), ("web: {}", web),
        )
        parts = [f"Savez {naziv}"] + [tpl.format(val) for tpl, val in fields if val]
        fact = ". ".join(parts) + "."
        if len(fact) < 30:
            continue
        refs = json.dumps([{"type":"pgz_sport_savez","id": sid}])
        rows.append((fact[:2000], 'pgz_sport_savez', 'pgz_sport_db_extract', refs, 0.92,
                     _fact_hash(f"savez:{sid}:{fact[:200]}")))
    return rows


def _overview_rows(cur):
    """Build one fact per federation: active club count and their cities."""
    cur.execute("""
        SELECT s.naziv, s.sport, count(k.id) AS n_kl,
               string_agg(k.grad, ', ' ORDER BY k.grad) FILTER (WHERE k.grad IS NOT NULL) AS gradovi
        FROM pgz_sport.savezi s
        LEFT JOIN pgz_sport.klubovi k ON k.savez_id = s.id AND k.aktivan = true
        GROUP BY s.id, s.naziv, s.sport HAVING count(k.id) > 0
    """)
    rows = []
    for savez, sport, n_klubova, gradovi in cur.fetchall():
        cities = gradovi or ''
        truncated = len(cities) > 200
        fact = f"{savez} ima {n_klubova} aktivnih klubova"
        if sport:
            fact += f" u sportu {sport}"
        if cities:
            fact += f". Gradovi: {cities[:200]}"
        # Previously the "..." marker was appended at 300 characters and then
        # sliced off again at 200, so a cut-off list looked complete.
        fact += "..." if truncated else "."
        rows.append((fact[:2000], 'pgz_sport_overview', 'pgz_sport_db_extract', json.dumps([{}]), 0.95,
                     _fact_hash(f"overview:{savez}")))
    return rows


def _multichair_rows(cur):
    """Build one fact per person who holds posts in two or more organisations."""
    cur.execute("""
        WITH all_links AS (
            SELECT lower(trim(predsjednik)) AS pk, predsjednik AS pname,
                   'klub:'||k.id AS oid, k.naziv AS oname, 'predsjednik' AS role
            FROM pgz_sport.klubovi k WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
            UNION ALL
            SELECT lower(trim(tajnik)), tajnik, 'klub:'||k.id, k.naziv, 'tajnik'
            FROM pgz_sport.klubovi k WHERE tajnik IS NOT NULL AND length(trim(tajnik)) > 5
            UNION ALL
            SELECT lower(trim(predsjednik)), predsjednik, 'savez:'||s.id, s.naziv, 'predsjednik'
            FROM pgz_sport.savezi s WHERE predsjednik IS NOT NULL AND length(trim(predsjednik)) > 5
        )
        SELECT pk, max(pname) AS pname, count(DISTINCT oid) AS n_orgs,
               string_agg(DISTINCT oname || ' (' || role || ')', '; ') AS orgs
        FROM all_links GROUP BY pk HAVING count(DISTINCT oid) >= 2
        ORDER BY count(DISTINCT oid) DESC LIMIT 200
    """)
    rows = []
    for pk, pname, n_orgs, orgs in cur.fetchall():
        # string_agg yields NULL when every organisation name is NULL.
        fact = f"{pname} sjedi na {n_orgs} stolica u PGŽ Sport ekosustavu: {(orgs or '')[:500]}"
        if n_orgs >= 3:
            fact += " — VIŠESTRUKE FUNKCIJE: forenzički flag za moguće sukobe interesa."
        rows.append((fact[:2000], 'pgz_sport_multichair', 'pgz_sport_db_extract', json.dumps([{}]), 0.90,
                     _fact_hash(f"multichair:{pk}")))
    return rows


def _event_rows(cur):
    """Build fact rows for sport events and for (at most 500) competitions."""
    rows = []

    cur.execute("SELECT id, naziv, mjesto, godina_od, organizator, razina, broj_ucesnika FROM pgz_sport.manifestacije WHERE naziv IS NOT NULL")
    for mid, naziv, mjesto, god, org, razina, n_uces in cur.fetchall():
        fact = f"Sportska manifestacija: {naziv}"
        if mjesto: fact += f", mjesto: {mjesto}"
        if god: fact += f", godina {god}"
        if org: fact += f", organizator: {org}"
        if razina: fact += f", razina: {razina}"
        if n_uces: fact += f", broj učesnika: {n_uces}"
        fact += "."
        rows.append((fact[:2000], 'pgz_sport_manifestacija', 'pgz_sport_db_extract', json.dumps([{}]), 0.85,
                     _fact_hash(f"man:{mid}")))

    cur.execute("SELECT id, naziv, sport, datum_pocetka::text AS godina, sezona, razina, tip, kategorija FROM pgz_sport.natjecanja WHERE naziv IS NOT NULL LIMIT 500")
    for nid, naziv, sport, god, sez, raz, tip, kat in cur.fetchall():
        fact = f"Natjecanje: {naziv}"
        if sport: fact += f" — sport: {sport}"
        if sez: fact += f", sezona {sez}"
        if raz: fact += f", razina: {raz}"
        if tip: fact += f", tip: {tip}"
        if kat: fact += f", kategorija: {kat}"
        if god and god != 'None': fact += f", datum početka: {god[:10]}"
        fact += "."
        rows.append((fact[:2000], 'pgz_sport_natjecanje', 'pgz_sport_db_extract', json.dumps([{}]), 0.85,
                     _fact_hash(f"nat:{nid}")))
    return rows
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,50 @@
|
||||
#!/usr/bin/env python3
|
||||
# Sudreg lookup po klubu nazivu za one BEZ OIB
|
||||
import psycopg2, time, hashlib, json, re
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
import os  # used only for the environment lookup below

# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
|
||||
def main():
    """Fill in missing club OIBs from the locally cached court-register data.

    For every club without a usable OIB the generic words klub / udruga /
    sportski / sportsko are removed from its name, and the first 30
    characters of the rest are searched (case-insensitively, as a substring)
    in ``civic.sudreg_api_cache``. The OIB is written only when exactly one
    cache entry matches. The printed totals count rows that were really
    updated.
    """
    conn = psycopg2.connect(DSN, cursor_factory=RealDictCursor)
    conn.autocommit = True
    cur = conn.cursor()

    try:
        # Clubs without an OIB whose name is long enough to search for
        cur.execute("""
            SELECT id, naziv, grad FROM pgz_sport.klubovi
            WHERE (oib IS NULL OR length(oib) != 11) AND naziv IS NOT NULL AND length(naziv) > 8
            ORDER BY id
        """)
        klubovi = cur.fetchall()
        print(f"Klubovi za Sudreg lookup: {len(klubovi)}")

        found = 0
        for k in klubovi:
            # Drop generic words, then collapse the gaps they leave behind so
            # a word removed from the middle does not produce a double space
            # that no register name contains.
            clean = re.sub(r'\b(klub|udruga|sportski|sportsko)\b', '', k['naziv'], flags=re.I)
            clean = ' '.join(clean.split())
            if len(clean) < 5:
                continue

            # Escape LIKE metacharacters so names containing % or _ are
            # matched literally.
            needle = clean[:30].replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
            pattern = f'%{needle}%'
            cur.execute("""
                SELECT data->>'oib' AS oib, data->>'tvrtka' AS tvrtka, data->>'naziv' AS naziv
                FROM civic.sudreg_api_cache
                WHERE (data->>'tvrtka' ILIKE %s OR data->>'naziv' ILIKE %s)
                  AND data->>'oib' IS NOT NULL
                LIMIT 3
            """, (pattern, pattern))
            cands = cur.fetchall()

            # Only an unambiguous (single) candidate is trusted.
            if len(cands) == 1 and cands[0]['oib']:
                # NOTE(review): the SELECT above also picks clubs whose OIB is
                # malformed (length != 11), but this UPDATE touches only NULL
                # OIBs — confirm whether malformed values should be replaced.
                cur.execute("UPDATE pgz_sport.klubovi SET oib=%s WHERE id=%s AND oib IS NULL",
                            (cands[0]['oib'], k['id']))
                updated = cur.rowcount
                found += updated
                if updated and found % 20 == 0:
                    print(f"Found: {found}")

        print(f"FINAL: {found} OIB-ova nadeno preko sudreg_api_cache")
    finally:
        cur.close(); conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env python3
|
||||
# sukob_sport_scraper.py — Povjerenstvo za sukob interesa, filter za sport funkcionere
|
||||
import os, time, hashlib, logging, re, json
|
||||
from urllib.parse import urljoin
|
||||
import urllib.request
|
||||
import psycopg2
|
||||
from html import unescape
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [sukob] %(message)s')
|
||||
log = logging.getLogger("sukob")
|
||||
# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
UA = "Mozilla/5.0 (Ri.NET Bot 1.0)"
|
||||
|
||||
def fetch(url):
    """Download *url* once and return ``(text, status)``.

    The body is decoded as UTF-8 with undecodable bytes replaced. On any
    error the failure is logged and ``(None, 0)`` is returned. Unlike the
    previous bare ``except``, KeyboardInterrupt and SystemExit are no longer
    swallowed.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(req, timeout=20) as r:
            return r.read().decode('utf-8', errors='replace'), r.status
    except Exception as e:
        log.warning(f"Fetch fail: {url} {e}")
        return None, 0
|
||||
|
||||
def harvest():
    """Cross-check sport officials against the conflict-of-interest register.

    Up to 100 distinct president/secretary names are read from the club and
    federation tables; the first 20 are searched on sukobinteresa.hr. When a
    name occurs in the text of the result page, the surrounding text is
    stored as a fact in ``dabi.knowledge``.

    NOTE(review): a name merely occurring on the search-result page is weak
    evidence, yet it is stored as a "FORENSIČKI FLAG" with confidence 0.90 —
    confirm this is intended before relying on these facts.
    """
    from urllib.parse import quote_plus

    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()

    try:
        # Names of club/federation presidents and secretaries for the cross-check
        cur.execute("""
            SELECT DISTINCT ime FROM (
                SELECT predsjednik AS ime FROM pgz_sport.klubovi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
                UNION
                SELECT tajnik FROM pgz_sport.klubovi WHERE tajnik IS NOT NULL AND length(tajnik)>5
                UNION
                SELECT predsjednik FROM pgz_sport.savezi WHERE predsjednik IS NOT NULL AND length(predsjednik)>5
            ) t LIMIT 100
        """)
        sport_imena = [r[0].strip() for r in cur.fetchall() if r[0]]
        log.info(f"Sport imena za cross-check: {len(sport_imena)}")

        facts = 0
        for ime in sport_imena[:20]:
            url = f"https://www.sukobinteresa.hr/hr/imovinsko-stanje/imovinske-kartice?search={quote_plus(ime)}"
            html, status = fetch(url)
            time.sleep(2)
            if not html or status != 200:
                continue

            needle = ime.lower()
            if needle not in html.lower():
                continue

            # Reduce the page to plain text
            text = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.S|re.I)
            text = re.sub(r'<[^>]+>', ' ', text)
            text = re.sub(r'\s+', ' ', unescape(text)).strip()

            # str.find returns -1 when absent; index 0 is a valid hit.
            idx = text.lower().find(needle)
            if idx < 0:
                continue

            ctx = text[max(0, idx-300):idx+500]
            fact = f"FORENSIČKI FLAG: {ime} se nalazi u registru imovinskih kartica Povjerenstva za sukob interesa. Kontekst: {ctx[:600]}"
            fh = hashlib.sha256(f"sukob:{ime}".encode()).hexdigest()[:32]
            try:
                cur.execute("""INSERT INTO dabi.knowledge
                    (fact, category, source, source_refs, confidence, data_hash, created_at)
                    VALUES (%s,'sukob_interesa_sport','sukob_scraper',%s::jsonb,0.90,%s,now())
                    ON CONFLICT (data_hash) DO NOTHING""",
                    (fact[:2000], json.dumps([{"url":url, "ime":ime}]), fh))
                facts += cur.rowcount
                log.info(f"✓ Match: {ime}")
            except Exception as e:
                log.warning(f"Insert fail for {ime}: {e}")

        log.info(f"FINAL: {facts} sukob facts")
    finally:
        cur.close(); conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
harvest()
|
||||
@@ -0,0 +1,176 @@
|
||||
#!/usr/bin/env python3
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
# wiki_hr_scraper.py | v1.0.0 | 04.05.2026
|
||||
# Svrha: Hrvatska Wikipedia — extract relevant pages za HR knowledge
|
||||
# Strategy: API search po HR-relevant kategorijama + fetch top results
|
||||
# ═══════════════════════════════════════════════════════════════════
|
||||
"""Hrvatska Wikipedia scraper (preko API)."""
|
||||
import os, time, hashlib, logging, re, json
|
||||
import urllib.request, urllib.parse
|
||||
import psycopg2
|
||||
import sys
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s [wiki_hr] %(message)s')
|
||||
log = logging.getLogger("wiki_hr")
|
||||
# SECURITY: this connection string carries a database password in source
# control. It can be overridden with the PGZ_SPORT_DSN environment variable;
# the literal is kept only as a backward-compatible fallback and should be
# dropped once the credential has been rotated.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
UA = "Ri.NET Bot 1.0 (contact: dradulic@outlook.com)"
|
||||
API = "https://hr.wikipedia.org/w/api.php"
|
||||
|
||||
# Kategorije — širok HR knowledge bazu
|
||||
CATEGORIES = [
|
||||
"Hrvatski_gradovi",
|
||||
"Hrvatske_općine",
|
||||
"Hrvatski_otoci",
|
||||
"Hrvatske_planine",
|
||||
"Hrvatske_rijeke",
|
||||
"Primorsko-goranska_županija",
|
||||
"Naselja_u_Primorsko-goranskoj_županiji",
|
||||
"Hrvatski_političari",
|
||||
"Hrvatski_sportaši",
|
||||
"Hrvatski_glazbenici",
|
||||
"Hrvatski_pisci",
|
||||
"Hrvatski_glumci",
|
||||
"Hrvatska_povijest",
|
||||
"Hrvatska_arhitektura",
|
||||
"Hrvatska_kuhinja",
|
||||
"Hrvatska_kultura",
|
||||
"Hrvatska_znanost",
|
||||
"Domovinski_rat",
|
||||
"Hrvatska_ekonomija",
|
||||
"Hrvatski_klubovi",
|
||||
"Hrvatski_nogometni_klubovi",
|
||||
"Hrvatski_košarkaški_klubovi",
|
||||
"Hrvatske_političke_stranke",
|
||||
"Predsjednici_Hrvatske",
|
||||
"Premijeri_Hrvatske",
|
||||
"Rijeka",
|
||||
"Kvarner",
|
||||
"Krk",
|
||||
"Cres",
|
||||
"Lošinj",
|
||||
"Rab",
|
||||
"Pag",
|
||||
"Učka",
|
||||
"Risnjak",
|
||||
]
|
||||
|
||||
def api_get(params):
    """Send a GET request to the Wikipedia API and return the decoded JSON.

    ``format=json`` and ``utf8=1`` are always set on a copy of *params*; the
    caller's dict is not modified. Any failure while requesting or decoding
    is logged and an empty dict is returned instead.
    """
    query = {**params, 'format': 'json', 'utf8': '1'}
    url = f"{API}?{urllib.parse.urlencode(query)}"
    try:
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        with urllib.request.urlopen(request, timeout=20) as response:
            body = response.read().decode('utf-8')
            return json.loads(body)
    except Exception as err:
        log.warning(f"API fail: {err}")
        return {}
|
||||
|
||||
def category_members(cat, limit=500):
    """Return up to *limit* page titles from the category ``Kategorija:<cat>``.

    Result pages are followed through the API's ``cmcontinue`` token, with a
    0.5 s pause between requests. Each request asks only for as many members
    as are still needed (the API maximum of 500 per request is respected).
    A non-positive *limit* returns an empty list without calling the API.
    """
    if limit <= 0:
        return []
    pages = []
    cont = ''
    while True:
        params = {
            'action': 'query',
            'list': 'categorymembers',
            'cmtitle': f'Kategorija:{cat}',
            # never request more than is still missing
            'cmlimit': str(min(500, limit - len(pages))),
            'cmtype': 'page',
        }
        if cont:
            params['cmcontinue'] = cont
        d = api_get(params)
        if not d.get('query'):
            break
        for m in d['query'].get('categorymembers', []):
            pages.append(m['title'])
            if len(pages) >= limit:
                return pages
        cont = d.get('continue', {}).get('cmcontinue')
        if not cont:
            break
        time.sleep(0.5)
    return pages
|
||||
|
||||
def fetch_page_extract(title):
    """Return ``(plain_text, page_url)`` for the page *title*.

    The whole page is requested as plain text. ``(None, None)`` is returned
    when the API call fails or the page does not exist.
    """
    params = {
        'action': 'query',
        'prop': 'extracts|info',
        # 'exintro' is deliberately absent: MediaWiki treats a boolean
        # parameter as true whenever it is present, whatever its value, so the
        # former 'exintro': '0' limited every extract to the intro section.
        'explaintext': '1',
        'inprop': 'url',
        'titles': title,
        'exsectionformat': 'plain',
        'exlimit': '1',
    }
    d = api_get(params)
    if not d.get('query'):
        return None, None
    for pid, p in d['query'].get('pages', {}).items():
        if pid == '-1':  # id the API reports for a missing page
            continue
        return p.get('extract', ''), p.get('fullurl', '')
    return None, None
|
||||
|
||||
def harvest():
    """Store Croatian Wikipedia pages from ``CATEGORIES`` as knowledge facts.

    For each category up to 200 page titles are listed. A page is skipped
    when a ``wiki_hr`` fact for exactly that title already exists. Otherwise
    its text is fetched and, if at least 200 characters long, split into up
    to four 1500-character chunks: the first is stored with confidence 0.92,
    the rest with 0.85. Every fact is prefixed with ``"<title> — "``.

    The database connection is closed even if the run aborts with an error.
    """
    conn = psycopg2.connect(DSN); conn.autocommit = True
    cur = conn.cursor()

    total_facts = 0
    total_pages = 0

    try:
        for cat in CATEGORIES:
            log.info(f"=== Kategorija: {cat} ===")
            pages = category_members(cat, limit=200)
            log.info(f" pages: {len(pages)}")
            category = f'wiki_{cat[:30]}'

            for title in pages:
                try:
                    # Skip pages we already have. Match the title together
                    # with the " — " separator, otherwise "Krk" would be
                    # skipped as soon as "Krka — ..." exists. LIKE wildcards
                    # in the title are escaped so they match literally.
                    escaped = title.replace('\\', '\\\\').replace('%', '\\%').replace('_', '\\_')
                    cur.execute("SELECT 1 FROM dabi.knowledge WHERE source = 'wiki_hr' AND fact LIKE %s LIMIT 1",
                                (f'{escaped} — %',))
                    if cur.fetchone():
                        continue

                    extract, url = fetch_page_extract(title)
                    time.sleep(0.5)
                    if not extract or len(extract) < 200:
                        continue

                    # First chunk (start of the page) gets higher confidence
                    chunks = [(title + " — " + extract[:1500], 0.92)]
                    chunks += [
                        (title + " — " + extract[i:i+1500], 0.85)
                        for i in range(1500, min(len(extract), 6000), 1500)
                    ]
                    refs = json.dumps([{"url": url, "title": title, "wikipedia": "hr"}])

                    for chunk_text, conf in chunks:
                        fh = hashlib.sha256(f"wiki:{title}:{chunk_text[:80]}".encode()).hexdigest()[:32]
                        try:
                            cur.execute("""
                                INSERT INTO dabi.knowledge
                                    (fact, category, source, source_refs, confidence, data_hash, created_at)
                                VALUES (%s, %s, 'wiki_hr', %s::jsonb, %s, %s, now())
                                ON CONFLICT (data_hash) DO NOTHING
                            """, (chunk_text[:2000], category, refs, conf, fh))
                            total_facts += cur.rowcount
                        except Exception as e:
                            # only the first few failures are logged
                            if total_facts < 5:
                                log.warning(f"insert: {e}")

                    total_pages += 1
                    if total_pages % 20 == 0:
                        log.info(f" Progress: {total_pages} pages, {total_facts} facts")

                except Exception as e:
                    log.warning(f" Page fail '{title}': {e}")
                    continue

            log.info(f" Done {cat}: total facts={total_facts}")

        log.info(f"═══ FINAL: {total_pages} pages, {total_facts} facts ═══")
    finally:
        cur.close(); conn.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
harvest()
|
||||
Executable
+110
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.3 — Wiki + Wikidata logo enrichment for top klubovi."""
|
||||
import psycopg2, requests, re, json, time, urllib.parse
|
||||
|
||||
import os  # used only for the environment lookup below

# SECURITY: the database password is committed to source control. It can be
# overridden with the PGZ_SPORT_DB_PASSWORD environment variable; the literal
# is kept only as a backward-compatible fallback and should be dropped once
# the credential has been rotated.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
          user="rinet",
          password=os.environ.get("PGZ_SPORT_DB_PASSWORD", "R1net2026!SecureDB#v7"))
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
TIMEOUT = 20
|
||||
DELAY = 0.5
|
||||
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
||||
|
||||
def query_wiki(name, lang="hr"):
    """Search Wikipedia for *name* and return details of the first sport page.

    The top three search hits are considered. A hit is accepted when its
    title contains the last word of *name* and its intro text mentions one of
    the sport keywords.

    Returns a dict with ``title``, ``logo`` (image URL or None),
    ``biografija`` (intro, at most 1500 characters), ``source_url`` and
    ``lang``; or ``None`` when nothing suitable is found, the request fails,
    or *name* is blank.
    """
    words = name.split()
    if not words:
        # nothing to search for; previously this raised IndexError below
        return None
    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    sport_keywords = ("klub", "sport", "liga", "prvenstv", "football", "basketball",
                      "handball", "water polo", "volleyball", "cycling", "sailing",
                      "tim", "club")

    # 'except Exception' keeps the lookup best-effort without also swallowing
    # KeyboardInterrupt / SystemExit the way a bare 'except' does.
    try:
        r = s.get(api_url,
                  params={"action":"query","format":"json","list":"search",
                          "srsearch":name,"srlimit":3,"utf8":1}, timeout=TIMEOUT)
        sr = r.json().get("query",{}).get("search",[])
        if not sr:
            return None
        candidates = [x["title"] for x in sr]
    except Exception:
        return None

    # Pick the first candidate whose title contains the last word of the name
    key = words[-1].lower()
    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)
        try:
            r = s.get(api_url,
                      params={"action":"query","format":"json",
                              "prop":"extracts|pageimages|info",
                              "exintro":1,"explaintext":1,
                              "piprop":"original|thumbnail","pithumbsize":500,
                              "inprop":"url","titles":title,"utf8":1}, timeout=TIMEOUT)
            pages = r.json().get("query",{}).get("pages",{})
            for pid, p in pages.items():
                if pid == "-1":
                    continue
                extract = p.get("extract","")
                if not extract:
                    continue
                # Sport context check
                el = extract.lower()
                if not any(k in el for k in sport_keywords):
                    continue
                logo = (p.get("thumbnail",{}).get("source") or
                        p.get("original",{}).get("source"))
                page_url = p.get("fullurl") or f"https://{lang}.wikipedia.org/wiki/{urllib.parse.quote(title.replace(' ','_'))}"
                return {"title":title, "logo":logo, "biografija":extract[:1500], "source_url":page_url, "lang":lang}
        except Exception:
            continue
    return None
|
||||
|
||||
def enrich_klub(naziv):
    """Look up a club on Wikipedia, trying several spellings of its name.

    Queries are attempted in this order: the name as given, the name without a
    leading club-type abbreviation (NK, RK, KK, ...), and - when the original
    name does not mention Rijeka - that shortened name with " Rijeka" appended.
    Each query is tried on the Croatian Wikipedia first, then the English one.

    Returns the first truthy result of ``query_wiki``, or None if every
    attempt fails.
    """
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()

    queries = [naziv]
    if stripped != naziv:
        queries.append(stripped)
        if "Rijeka" not in naziv:
            queries.append(f"{stripped} Rijeka")

    for query in queries:
        for wiki_lang in ("hr", "en"):
            hit = query_wiki(query, wiki_lang)
            if hit:
                return hit
    return None
|
||||
|
||||
# === MAIN ===
# Script body: select active clubs that have no logo yet but do have season or
# award records, look each one up on Wikipedia and store what was found.
conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    # Top clubs: most trophies + award winners, missing logo.
    cr.execute("""
        WITH top_klubovi AS (
            SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
                   (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
                   (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
            FROM pgz_sport.klubovi k
            WHERE k.id != 4426 AND k.aktivan=true
        )
        SELECT id, naziv FROM top_klubovi
        WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
        ORDER BY trofeja DESC, nagrada DESC LIMIT 50
    """)
    todo = cr.fetchall()
    print(f"Klubovi to enrich (logo): {len(todo)}")

    success = 0
    for kid, naziv in todo:
        print(f" → {naziv}", end="", flush=True)
        r = enrich_klub(naziv)
        if not r:
            print(" MISS")
            continue

        # Build the SET clause from fixed fragments; values go in as parameters.
        sets, vals = [], []
        if r.get("logo"):
            sets.append("logo_url = %s")
            vals.append(r["logo"])
        # COALESCE keeps any note / website already stored for the club.
        sets.append("napomena = COALESCE(napomena, %s)")
        vals.append(r["biografija"][:1000])
        sets.append("web = COALESCE(web, %s)")
        vals.append(r["source_url"])
        sets.append("source_synced_at = now()")
        vals.append(kid)
        try:
            cr.execute(f"UPDATE pgz_sport.klubovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            success += 1
            flags = " +LOGO" if r.get("logo") else ""
            print(f" ✓ [{r['lang']}] {r['title']}{flags}")
        except Exception as e:
            # One failed row must not abort the rest of the batch.
            print(f" DBerr: {e}")

    print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
finally:
    # Release the connection even if the run is interrupted or fails midway.
    conn.close()
|
||||
+110
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.3 — Wiki + Wikidata logo enrichment for top klubovi."""
|
||||
import psycopg2, requests, re, json, time, urllib.parse
|
||||
|
||||
# Keyword arguments for psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control; consider
# loading it from the environment or a secrets store instead.
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
          user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every request made through the session below.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Passed as timeout= to each HTTP request (seconds).
TIMEOUT = 20
# Passed to time.sleep() between consecutive API calls (seconds).
DELAY = 0.5

# Shared HTTP session so the headers are set once for all requests.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
||||
|
||||
def query_wiki(name, lang="hr"):
    """Search Wikipedia for *name* and return data for the first plausible sports page.

    Runs a full-text search (top 3 hits) on the ``lang`` Wikipedia, keeps only
    hits whose title contains the last word of *name*, and fetches the intro
    extract, page image and canonical URL for each such hit in turn.

    Args:
        name: Club name (or name variant) to search for.
        lang: Wikipedia language subdomain, e.g. ``"hr"`` or ``"en"``.

    Returns:
        A dict with keys ``title``, ``logo`` (image URL or None),
        ``biografija`` (intro text, at most 1500 characters), ``source_url``
        and ``lang``; or None when nothing suitable is found or a request fails.
    """
    # Guard: an empty/whitespace-only name has no "last word" to match on.
    words = name.split()
    if not words:
        return None
    key = words[-1].lower()

    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    # Substrings that mark an extract as sports-related (Croatian + English).
    # NOTE(review): "tim" is matched as a plain substring, so it also hits
    # words such as "time"; confirm whether that looseness is intended.
    sport_words = ("klub", "sport", "liga", "prvenstv", "football", "basketball",
                   "handball", "water polo", "volleyball", "cycling", "sailing",
                   "tim", "club")

    try:
        r = s.get(api_url,
                  params={"action": "query", "format": "json", "list": "search",
                          "srsearch": name, "srlimit": 3, "utf8": 1},
                  timeout=TIMEOUT)
        sr = r.json().get("query", {}).get("search", [])
        if not sr:
            return None
        candidates = [x["title"] for x in sr]
    except Exception:
        # Best-effort lookup: network/JSON problems count as "not found".
        # (Not a bare except, so Ctrl-C still interrupts the script.)
        return None

    # Pick the first candidate whose title contains the key word from the name.
    for title in candidates:
        if key not in title.lower():
            continue
        time.sleep(DELAY)  # throttle between API calls
        try:
            r = s.get(api_url,
                      params={"action": "query", "format": "json",
                              "prop": "extracts|pageimages|info",
                              "exintro": 1, "explaintext": 1,
                              "piprop": "original|thumbnail", "pithumbsize": 500,
                              "inprop": "url", "titles": title, "utf8": 1},
                      timeout=TIMEOUT)
            pages = r.json().get("query", {}).get("pages", {})
            for pid, p in pages.items():
                if pid == "-1":  # the API reports a missing page under id "-1"
                    continue
                extract = p.get("extract", "")
                if not extract:
                    continue
                # Sport context check: skip pages whose intro has no sports keyword.
                el = extract.lower()
                if not any(k in el for k in sport_words):
                    continue
                # Prefer the 500px thumbnail, fall back to the original image.
                logo = (p.get("thumbnail", {}).get("source") or
                        p.get("original", {}).get("source"))
                page_url = (p.get("fullurl") or
                            f"https://{lang}.wikipedia.org/wiki/"
                            f"{urllib.parse.quote(title.replace(' ', '_'))}")
                return {"title": title, "logo": logo, "biografija": extract[:1500],
                        "source_url": page_url, "lang": lang}
        except Exception:
            # A failure on one candidate should not block the remaining ones.
            continue
    return None
|
||||
|
||||
def enrich_klub(naziv):
    """Look up a club on Wikipedia, trying several spellings of its name.

    Queries are attempted in this order: the name as given, the name without a
    leading club-type abbreviation (NK, RK, KK, ...), and - when the original
    name does not mention Rijeka - that shortened name with " Rijeka" appended.
    Each query is tried on the Croatian Wikipedia first, then the English one.

    Returns the first truthy result of ``query_wiki``, or None if every
    attempt fails.
    """
    stripped = re.sub(r'^(HNK|NK|RK|KK|VK|HK|AK|TK|BK|PK|HAOK|HŠK)\s+', '', naziv).strip()

    queries = [naziv]
    if stripped != naziv:
        queries.append(stripped)
        if "Rijeka" not in naziv:
            queries.append(f"{stripped} Rijeka")

    for query in queries:
        for wiki_lang in ("hr", "en"):
            hit = query_wiki(query, wiki_lang)
            if hit:
                return hit
    return None
|
||||
|
||||
# === MAIN ===
# Script body: select active clubs that have no logo yet but do have season or
# award records, look each one up on Wikipedia and store what was found.
conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    # Top clubs: most trophies + award winners, missing logo.
    cr.execute("""
        WITH top_klubovi AS (
            SELECT k.id, k.naziv, k.logo_url, k.napomena, k.web,
                   (SELECT count(*) FROM pgz_sport.klub_sezona WHERE klub_id=k.id) AS trofeja,
                   (SELECT count(*) FROM pgz_sport.clan_nagrada WHERE klub_id=k.id) AS nagrada
            FROM pgz_sport.klubovi k
            WHERE k.id != 4426 AND k.aktivan=true
        )
        SELECT id, naziv FROM top_klubovi
        WHERE logo_url IS NULL AND (trofeja > 0 OR nagrada > 0)
        ORDER BY trofeja DESC, nagrada DESC LIMIT 50
    """)
    todo = cr.fetchall()
    print(f"Klubovi to enrich (logo): {len(todo)}")

    success = 0
    for kid, naziv in todo:
        print(f" → {naziv}", end="", flush=True)
        r = enrich_klub(naziv)
        if not r:
            print(" MISS")
            continue

        # Build the SET clause from fixed fragments; values go in as parameters.
        sets, vals = [], []
        if r.get("logo"):
            sets.append("logo_url = %s")
            vals.append(r["logo"])
        # COALESCE keeps any note / website already stored for the club.
        sets.append("napomena = COALESCE(napomena, %s)")
        vals.append(r["biografija"][:1000])
        sets.append("web = COALESCE(web, %s)")
        vals.append(r["source_url"])
        sets.append("source_synced_at = now()")
        vals.append(kid)
        try:
            cr.execute(f"UPDATE pgz_sport.klubovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            success += 1
            flags = " +LOGO" if r.get("logo") else ""
            print(f" ✓ [{r['lang']}] {r['title']}{flags}")
        except Exception as e:
            # One failed row must not abort the rest of the batch.
            print(f" DBerr: {e}")

    print(f"\n=== Klubovi enriched: {success}/{len(todo)} ===")
finally:
    # Release the connection even if the run is interrupted or fails midway.
    conn.close()
|
||||
Executable
+132
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
|
||||
import psycopg2, requests, re, json, time, urllib.parse
|
||||
|
||||
# Keyword arguments for psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control; consider
# loading it from the environment or a secrets store instead.
DB = dict(host="10.10.0.2", port=6432, dbname="rinet_v3",
          user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every request made through the session below.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Passed as timeout= to each HTTP request (seconds).
TIMEOUT = 20
# Passed to time.sleep() between consecutive API calls (seconds).
DELAY = 0.4

# Shared HTTP session so the headers are set once for all requests.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
||||
|
||||
def get_wikidata_id(title, lang):
    """Return the Wikidata Q-id linked to a Wikipedia page.

    Args:
        title: Page title on the ``lang`` Wikipedia.
        lang: Wikipedia language subdomain, e.g. ``"hr"``.

    Returns:
        The ``wikibase_item`` page property (e.g. ``"Q42"``), or None when the
        page is missing, has no linked item, or the request fails.
    """
    try:
        r = s.get(f"https://{lang}.wikipedia.org/w/api.php",
                  params={"action": "query", "format": "json", "prop": "pageprops",
                          "ppprop": "wikibase_item", "titles": title},
                  timeout=TIMEOUT)
        pages = r.json().get("query", {}).get("pages", {})
        for pid, p in pages.items():
            if pid == "-1":  # the API reports a missing page under id "-1"
                continue
            qid = p.get("pageprops", {}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Best-effort: network/JSON errors mean "no id". Not a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        pass
    return None
|
||||
|
||||
def get_wikidata_entity(qid):
    """Fetch the Wikidata entity JSON for *qid*.

    Returns:
        The entity dict stored under ``entities[qid]`` in the
        Special:EntityData response, or None when it is absent or the
        request/JSON decoding fails.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
                  timeout=TIMEOUT)
        return r.json().get("entities", {}).get(qid)
    except Exception:
        # Best-effort: treat any request/parse failure as "no entity", while
        # still letting KeyboardInterrupt/SystemExit through.
        return None
|
||||
|
||||
def get_label(qid, lang="hr"):
    """Return a human-readable label for a Wikidata entity.

    Args:
        qid: Wikidata entity id.
        lang: Preferred label language.

    Returns:
        The label in ``lang`` if present, otherwise the English label,
        otherwise the Croatian one; None when the entity cannot be fetched or
        has none of these labels.
    """
    # Reuse the shared fetch helper instead of duplicating the request code;
    # it already maps request/JSON failures to None.
    ent = get_wikidata_entity(qid) or {}
    try:
        labels = ent.get("labels", {})
        return (labels.get(lang, {}).get("value") or
                labels.get("en", {}).get("value") or
                labels.get("hr", {}).get("value"))
    except Exception:
        # Unexpected payload shape: treat as "no label".
        return None
|
||||
|
||||
def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity dict.

    Args:
        entity: Entity dict as returned by ``get_wikidata_entity`` (may be
            None or empty).

    Returns:
        A dict that may contain ``datum_rodenja`` (``YYYY-MM-DD`` string taken
        from the first usable P569 claim) and ``mjesto_rodenja`` (label of the
        first P19 claim that resolves, truncated to 100 characters). Keys are
        omitted when no usable value is found.
    """
    from datetime import date  # local import: not among the file-level imports

    result = {}
    if not entity:
        return result
    claims = entity.get("claims", {})

    # P569 = date of birth.
    for claim in claims.get("P569", []):
        try:
            stamp = claim["mainsnak"]["datavalue"]["value"]["time"]
            # Only a leading '+' is accepted: a '-' marks a BCE year, which
            # must not be read as a CE date.
            m = re.match(r"^\+?(\d{4})-(\d{2})-(\d{2})", stamp)
        except (KeyError, TypeError):
            continue  # claim without a usable time value (e.g. "novalue")
        if not m:
            continue
        try:
            # date() rejects month/day 00 (year- or month-precision values)
            # and impossible days such as 31 February.
            born = date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except ValueError:
            continue
        # Sanity window: after 1900 and not in the future.
        if born.year > 1900 and born <= date.today():
            result["datum_rodenja"] = born.isoformat()
            break

    # P19 = place of birth; the claim holds an entity id that needs a label lookup.
    for claim in claims.get("P19", []):
        try:
            place_qid = claim["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        label = get_label(place_qid, "hr")
        time.sleep(DELAY)  # throttle after each label request
        if label:
            result["mjesto_rodenja"] = label[:100]
            break
    return result
|
||||
|
||||
# === MAIN ===
# Script body: for members whose source_url points at a Wikipedia article and
# who lack a birth date or birth place, resolve the article to a Wikidata item
# and fill in whatever is missing.
conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    cr.execute("""
        SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
        FROM pgz_sport.clanovi
        WHERE source_url LIKE '%wikipedia.org/wiki/%'
          AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
    """)
    todo = cr.fetchall()
    print(f"Pending: {len(todo)}")

    success = 0
    for cid, ime, prezime, source_url, dob, mjesto in todo:
        # Accept http:// as well as https://, and drop any "#section" suffix
        # so that it does not end up in the page title.
        m = re.match(r"https?://(\w+)\.wikipedia\.org/wiki/([^#]+)", source_url)
        if not m:
            continue
        lang = m.group(1)
        raw = m.group(2)
        title = urllib.parse.unquote(raw).replace("_", " ")

        qid = get_wikidata_id(title, lang)
        time.sleep(DELAY)
        if not qid:
            # Try alternate lang
            alt = "en" if lang == "hr" else "hr"
            qid = get_wikidata_id(title, alt)
            time.sleep(DELAY)
        if not qid:
            print(f" ✗ {ime} {prezime}: no Q-id")
            continue

        entity = get_wikidata_entity(qid)
        time.sleep(DELAY)
        parsed = parse_birth(entity)
        if not parsed:
            print(f" ✗ {ime} {prezime} ({qid}): no birth data")
            continue

        # Only fill columns that are currently empty; never overwrite.
        sets, vals = [], []
        if parsed.get("datum_rodenja") and not dob:
            sets.append("datum_rodenja = %s")
            vals.append(parsed["datum_rodenja"])
        if parsed.get("mjesto_rodenja") and not mjesto:
            sets.append("mjesto_rodenja = %s")
            vals.append(parsed["mjesto_rodenja"])
        if sets:
            sets.append("source_synced_at = now()")
            vals.append(cid)
            try:
                cr.execute(f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            except psycopg2.Error as e:
                # One bad row must not abort the whole batch.
                print(f" DBerr {ime} {prezime}: {e}")
                continue
            success += 1
        # The flags show what Wikidata returned for this member.
        flags = []
        if parsed.get("datum_rodenja"):
            flags.append(f"DOB={parsed['datum_rodenja']}")
        if parsed.get("mjesto_rodenja"):
            flags.append(f"M={parsed['mjesto_rodenja']}")
        print(f" ✓ {ime} {prezime} ({qid}): {' '.join(flags)}")

    print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")

    # Re-stats: coverage of birth data among Wikipedia-sourced members.
    cr.execute("""SELECT
        count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
        count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
        count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
        FROM pgz_sport.clanovi""")
    r = cr.fetchone()
    print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
finally:
    # Release the connection even if the run is interrupted or fails midway.
    conn.close()
|
||||
+132
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python3
|
||||
"""D.2b — Better URL handling for HR Wikipedia + Wikidata."""
|
||||
import psycopg2, requests, re, json, time, urllib.parse
|
||||
|
||||
# Keyword arguments for psycopg2.connect().
# NOTE(review): the database password is hard-coded in source control; consider
# loading it from the environment or a secrets store instead.
DB = dict(host="localhost", port=5432, dbname="rinet_v3",
          user="rinet", password="R1net2026!SecureDB#v7")
# User-Agent header sent with every request made through the session below.
UA = "RiNET-Civic/1.0 (https://rinet.one)"
# Passed as timeout= to each HTTP request (seconds).
TIMEOUT = 20
# Passed to time.sleep() between consecutive API calls (seconds).
DELAY = 0.4

# Shared HTTP session so the headers are set once for all requests.
s = requests.Session()
s.headers.update({"User-Agent": UA, "Accept-Language": "hr,en"})
|
||||
|
||||
def get_wikidata_id(title, lang):
    """Return the Wikidata Q-id linked to a Wikipedia page.

    Args:
        title: Page title on the ``lang`` Wikipedia.
        lang: Wikipedia language subdomain, e.g. ``"hr"``.

    Returns:
        The ``wikibase_item`` page property (e.g. ``"Q42"``), or None when the
        page is missing, has no linked item, or the request fails.
    """
    try:
        r = s.get(f"https://{lang}.wikipedia.org/w/api.php",
                  params={"action": "query", "format": "json", "prop": "pageprops",
                          "ppprop": "wikibase_item", "titles": title},
                  timeout=TIMEOUT)
        pages = r.json().get("query", {}).get("pages", {})
        for pid, p in pages.items():
            if pid == "-1":  # the API reports a missing page under id "-1"
                continue
            qid = p.get("pageprops", {}).get("wikibase_item")
            if qid:
                return qid
    except Exception:
        # Best-effort: network/JSON errors mean "no id". Not a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        pass
    return None
|
||||
|
||||
def get_wikidata_entity(qid):
    """Fetch the Wikidata entity JSON for *qid*.

    Returns:
        The entity dict stored under ``entities[qid]`` in the
        Special:EntityData response, or None when it is absent or the
        request/JSON decoding fails.
    """
    try:
        r = s.get(f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
                  timeout=TIMEOUT)
        return r.json().get("entities", {}).get(qid)
    except Exception:
        # Best-effort: treat any request/parse failure as "no entity", while
        # still letting KeyboardInterrupt/SystemExit through.
        return None
|
||||
|
||||
def get_label(qid, lang="hr"):
    """Return a human-readable label for a Wikidata entity.

    Args:
        qid: Wikidata entity id.
        lang: Preferred label language.

    Returns:
        The label in ``lang`` if present, otherwise the English label,
        otherwise the Croatian one; None when the entity cannot be fetched or
        has none of these labels.
    """
    # Reuse the shared fetch helper instead of duplicating the request code;
    # it already maps request/JSON failures to None.
    ent = get_wikidata_entity(qid) or {}
    try:
        labels = ent.get("labels", {})
        return (labels.get(lang, {}).get("value") or
                labels.get("en", {}).get("value") or
                labels.get("hr", {}).get("value"))
    except Exception:
        # Unexpected payload shape: treat as "no label".
        return None
|
||||
|
||||
def parse_birth(entity):
    """Extract birth date and birth place from a Wikidata entity dict.

    Args:
        entity: Entity dict as returned by ``get_wikidata_entity`` (may be
            None or empty).

    Returns:
        A dict that may contain ``datum_rodenja`` (``YYYY-MM-DD`` string taken
        from the first usable P569 claim) and ``mjesto_rodenja`` (label of the
        first P19 claim that resolves, truncated to 100 characters). Keys are
        omitted when no usable value is found.
    """
    from datetime import date  # local import: not among the file-level imports

    result = {}
    if not entity:
        return result
    claims = entity.get("claims", {})

    # P569 = date of birth.
    for claim in claims.get("P569", []):
        try:
            stamp = claim["mainsnak"]["datavalue"]["value"]["time"]
            # Only a leading '+' is accepted: a '-' marks a BCE year, which
            # must not be read as a CE date.
            m = re.match(r"^\+?(\d{4})-(\d{2})-(\d{2})", stamp)
        except (KeyError, TypeError):
            continue  # claim without a usable time value (e.g. "novalue")
        if not m:
            continue
        try:
            # date() rejects month/day 00 (year- or month-precision values)
            # and impossible days such as 31 February.
            born = date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        except ValueError:
            continue
        # Sanity window: after 1900 and not in the future.
        if born.year > 1900 and born <= date.today():
            result["datum_rodenja"] = born.isoformat()
            break

    # P19 = place of birth; the claim holds an entity id that needs a label lookup.
    for claim in claims.get("P19", []):
        try:
            place_qid = claim["mainsnak"]["datavalue"]["value"]["id"]
        except (KeyError, TypeError):
            continue
        label = get_label(place_qid, "hr")
        time.sleep(DELAY)  # throttle after each label request
        if label:
            result["mjesto_rodenja"] = label[:100]
            break
    return result
|
||||
|
||||
# === MAIN ===
# Script body: for members whose source_url points at a Wikipedia article and
# who lack a birth date or birth place, resolve the article to a Wikidata item
# and fill in whatever is missing.
conn = psycopg2.connect(**DB)
conn.autocommit = True
try:
    cr = conn.cursor()

    cr.execute("""
        SELECT id, ime, prezime, source_url, datum_rodenja, mjesto_rodenja
        FROM pgz_sport.clanovi
        WHERE source_url LIKE '%wikipedia.org/wiki/%'
          AND (datum_rodenja IS NULL OR mjesto_rodenja IS NULL)
    """)
    todo = cr.fetchall()
    print(f"Pending: {len(todo)}")

    success = 0
    for cid, ime, prezime, source_url, dob, mjesto in todo:
        # Accept http:// as well as https://, and drop any "#section" suffix
        # so that it does not end up in the page title.
        m = re.match(r"https?://(\w+)\.wikipedia\.org/wiki/([^#]+)", source_url)
        if not m:
            continue
        lang = m.group(1)
        raw = m.group(2)
        title = urllib.parse.unquote(raw).replace("_", " ")

        qid = get_wikidata_id(title, lang)
        time.sleep(DELAY)
        if not qid:
            # Try alternate lang
            alt = "en" if lang == "hr" else "hr"
            qid = get_wikidata_id(title, alt)
            time.sleep(DELAY)
        if not qid:
            print(f" ✗ {ime} {prezime}: no Q-id")
            continue

        entity = get_wikidata_entity(qid)
        time.sleep(DELAY)
        parsed = parse_birth(entity)
        if not parsed:
            print(f" ✗ {ime} {prezime} ({qid}): no birth data")
            continue

        # Only fill columns that are currently empty; never overwrite.
        sets, vals = [], []
        if parsed.get("datum_rodenja") and not dob:
            sets.append("datum_rodenja = %s")
            vals.append(parsed["datum_rodenja"])
        if parsed.get("mjesto_rodenja") and not mjesto:
            sets.append("mjesto_rodenja = %s")
            vals.append(parsed["mjesto_rodenja"])
        if sets:
            sets.append("source_synced_at = now()")
            vals.append(cid)
            try:
                cr.execute(f"UPDATE pgz_sport.clanovi SET {', '.join(sets)} WHERE id = %s", tuple(vals))
            except psycopg2.Error as e:
                # One bad row must not abort the whole batch.
                print(f" DBerr {ime} {prezime}: {e}")
                continue
            success += 1
        # The flags show what Wikidata returned for this member.
        flags = []
        if parsed.get("datum_rodenja"):
            flags.append(f"DOB={parsed['datum_rodenja']}")
        if parsed.get("mjesto_rodenja"):
            flags.append(f"M={parsed['mjesto_rodenja']}")
        print(f" ✓ {ime} {prezime} ({qid}): {' '.join(flags)}")

    print(f"\n=== Round 2 Updated: {success}/{len(todo)} ===")

    # Re-stats: coverage of birth data among Wikipedia-sourced members.
    cr.execute("""SELECT
        count(*) FILTER (WHERE datum_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS dob,
        count(*) FILTER (WHERE mjesto_rodenja IS NOT NULL AND source_url LIKE '%wikipedia%') AS mjesto,
        count(*) FILTER (WHERE source_url LIKE '%wikipedia%') AS total
        FROM pgz_sport.clanovi""")
    r = cr.fetchone()
    print(f"Final wiki: dob={r[0]} mjesto={r[1]} of total {r[2]}")
finally:
    # Release the connection even if the run is interrupted or fails midway.
    conn.close()
|
||||
Reference in New Issue
Block a user