138 lines
5.6 KiB
Python
138 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
# Fajl: sport_rezultati_arhivar.py | v1.0.0 | 05.05.2026
|
|
# Lokacija: /opt/pgz-sport/scrapers/sport_rezultati_arhivar.py
|
|
# Svrha: Wikipedia HR sezone HNL + Kup HR po godinama
|
|
# - Iterate kroz sve sezone HNL od 1992
|
|
# - Wikipedia API pages: "1._HNL_2023/24", "Kup_Hrvatske_u_nogometu_2024/25"
|
|
# - Extract konacne tablice + finalne utakmice
|
|
# - Plus PGŽ klubovi: HNK Rijeka, Orijent, Crikvenica, Opatija
|
|
# ═══════════════════════════════════════════════════════════════════
|
|
"""Sport rezultati historical arhivar."""
|
|
import os, re, json, time, hashlib
|
|
import urllib.request, urllib.parse
|
|
import psycopg2
|
|
from psycopg2.extras import execute_batch
|
|
|
|
# Connection string for the PostgreSQL database that receives the extracted facts.
# SECURITY NOTE(review): the fallback literal embeds a password in source.
# Supply the DSN through the PGZ_SPORT_DSN environment variable instead, then
# rotate the credential and remove the literal from this file.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# User-Agent header sent with every Wikipedia API request.
UA = "Ri.NET Civic Bot 1.0 (contact: dradulic@outlook.com)"

# MediaWiki Action API endpoint of the Croatian-language Wikipedia.
API = "https://hr.wikipedia.org/w/api.php"
|
|
|
|
|
|
def wiki_extract(title, sentences=None):
    """Fetch the plain-text extract of a page from the configured wiki API.

    Sends a ``prop=extracts`` query (plain text, redirects followed) to ``API``
    using the ``UA`` User-Agent and a 15 second timeout.

    Args:
        title: Page title to look up.
        sentences: Optional sentence cap, sent as ``exsentences``. A falsy
            value leaves the parameter out of the request.

    Returns:
        The ``extract`` text of the first page in the response (``""`` when
        the page has no extract), or ``None`` when the page is missing, the
        response holds no pages, or the request / JSON decoding fails.
    """
    import http.client
    import sys

    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": "1",
        "redirects": "1",
        "format": "json",
        "titles": title,
    }
    if sentences:
        params["exsentences"] = str(sentences)

    url = API + "?" + urllib.parse.urlencode(params)
    req = urllib.request.Request(url, headers={"User-Agent": UA})

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            payload = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException) as exc:
        # Best effort: one failed lookup must not abort the crawl, but it
        # should be visible. URLError/timeouts are OSError subclasses and
        # JSON/Unicode decoding errors are ValueError subclasses.
        print(f"wiki_extract: request for {title!r} failed: {exc}", file=sys.stderr)
        return None

    # Guard the response shape so an unexpected payload yields None, not a crash.
    query = payload.get("query") if isinstance(payload, dict) else None
    pages = query.get("pages") if isinstance(query, dict) else None
    if not isinstance(pages, dict):
        return None

    # Only one title is requested, so only the first page entry matters.
    for page_id, page in pages.items():
        if page_id == "-1" or not isinstance(page, dict):
            # "-1" is the id the API uses for a title that does not exist.
            return None
        return page.get("extract", "")
    return None
|
|
|
|
|
|
def chunk(text, max_len=700):
    """Split *text* into pieces of at most *max_len* characters.

    Text that already fits is returned whole as a one-element list (or an
    empty list for empty text). Longer text is cut greedily: each cut prefers
    to land just after a sentence or line separator found in the second half
    of the current window, otherwise it falls at the window edge. Pieces are
    stripped, and those of 80 characters or fewer are dropped.

    Args:
        text: The string to split.
        max_len: Maximum window size in characters.

    Returns:
        A list of string pieces in document order.

    Raises:
        ValueError: If *text* is longer than *max_len* and *max_len* is less
            than 1 (the window could never advance).
    """
    if len(text) <= max_len:
        # Short input is returned as-is: not stripped, no minimum-length filter.
        return [text] if text else []
    if max_len < 1:
        # A non-positive window would leave `start` unchanged and loop forever.
        raise ValueError(f"max_len must be >= 1, got {max_len}")

    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            # Separators are tried in priority order and the first one that
            # qualifies wins. A separator only qualifies past the window's
            # midpoint, so pieces do not become needlessly short.
            for sep in (". ", "! ", "? ", "\n"):
                pos = text.rfind(sep, start, end)
                if pos > start + max_len // 2:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    # Discard fragments too short to carry a useful fact.
    return [piece for piece in pieces if len(piece) > 80]
|
|
|
|
|
|
def insert_facts(conn, page, text, category, confidence=0.88):
    """Chunk *text* and insert the pieces into ``dabi.knowledge``.

    Each piece is stored with source ``"wikipedia_sport_arhiv"``, the given
    category and confidence, the MD5 hex digest of the piece as ``data_hash``,
    and ``{"page": page}`` as ``source_refs``. Rows whose ``data_hash`` already
    exists are skipped by ``ON CONFLICT ... DO NOTHING``.

    Args:
        conn: Open psycopg2 connection. This function does not commit.
        page: Wikipedia page title recorded in ``source_refs``.
        text: Page text. Empty text or text under 200 characters is ignored.
        category: Value for the ``category`` column.
        confidence: Value for the ``confidence`` column.

    Returns:
        The number of rows actually inserted, or 0 when the text is too short,
        yields no chunks, or the database reports an error.
    """
    if not text or len(text) < 200:
        return 0

    import sys
    from psycopg2.extras import execute_values

    refs = json.dumps({"page": page})
    rows = [
        (piece, "wikipedia_sport_arhiv", category, confidence,
         hashlib.md5(piece.encode()).hexdigest(), refs)
        for piece in chunk(text, 700)
    ]
    if not rows:
        return 0

    # RETURNING yields one row per real insert. cursor.rowcount cannot be used
    # after a batched helper because it reflects only the last statement sent.
    sql = (
        "INSERT INTO dabi.knowledge "
        "(fact, source, category, confidence, data_hash, source_refs) "
        "VALUES %s ON CONFLICT (data_hash) DO NOTHING RETURNING 1"
    )
    try:
        # The context manager closes the cursor even when the insert fails.
        with conn.cursor() as cur:
            inserted = execute_values(
                cur,
                sql,
                rows,
                template="(%s, %s, %s, %s, %s, %s::jsonb)",
                page_size=50,
                fetch=True,
            )
    except psycopg2.Error as exc:
        # Best effort: a failed page must not stop the crawl, but report it.
        print(f"insert_facts: insert for {page!r} failed: {exc}", file=sys.stderr)
        return 0
    return len(inserted)
|
|
|
|
|
|
def _candidate_pages():
    """Return the ordered ``(category, title)`` list of page titles to try.

    Several title spellings are generated per season; titles that do not
    exist are simply skipped later by the crawl.
    """
    pages = []

    # 1. Top-flight league seasons, starting years 1992 through 2025.
    for year in range(1992, 2026):
        yy = f"{(year + 1) % 100:02d}"  # two-digit closing year of the season
        for title in (
            f"1._HNL_{year}.",
            f"1._HNL_{year}./{yy}.",
            f"HNL_{year}/{yy}",
            f"HNL_{year}-{year + 1}",
            f"SuperSport_HNL_{year}./{yy}.",
            f"HT_Prva_HNL_{year}./{yy}.",
        ):
            pages.append(("hnl_sezona", title))

    # 2. Croatian football cup, one entry set per starting year.
    for year in range(1992, 2026):
        yy = f"{(year + 1) % 100:02d}"
        for title in (
            f"Kup_Hrvatske_u_nogometu_{year}.",
            f"Kup_Hrvatske_u_nogometu_{year}./{yy}.",
            f"Hrvatski_nogometni_kup_{year}-{year + 1}",
        ):
            pages.append(("hr_nogometni_kup", title))

    # 3. Club pages (club history).
    for club in (
        "HNK_Rijeka", "NK_Orijent", "NK_Krk", "NK_Crikvenica",
        "NK_Opatija", "NK_Mat-Promet", "NK_Pomorac", "NK_Naša_Slatina",
        "HNK_Rijeka_(boys)", "ŽNK_Rijeka",
        "HKK_Kvarner", "KK_Kvarner_2010", "KK_Lovran",
        "HMRK_Zamet", "MRK_Pomorac", "RK_Trsat", "RK_Crikvenica",
        "VK_Primorje", "VK_Rijeka",
        "HRK_Rijeka", "HOK_Rijeka", "OK_Rijeka",
        "HAOK_Mladost", "HAOK_Rijeka",
    ):
        pages.append(("pgz_klub_povijest", club))

    # 4. HNK Rijeka season pages, starting years 1990 through 2025.
    for year in range(1990, 2026):
        yy = f"{(year + 1) % 100:02d}"
        for title in (
            f"Sezona_HNK_Rijeka_{year}./{yy}.",
            f"HNK_Rijeka_u_sezoni_{year}-{year + 1}",
            f"HNK_Rijeka_{year}-{year + 1}_sezona",
        ):
            pages.append(("hnk_rijeka_sezona", title))

    return pages


def main():
    """Crawl all candidate Wikipedia pages and store their text as facts.

    Every candidate title is fetched with ``wiki_extract``. Pages with more
    than 300 characters of text are passed to ``insert_facts``. A progress
    line is printed after every tenth page found, and a summary at the end.

    Returns:
        A dict with ``pages_found``, ``pages_tried`` and ``facts`` counts.
    """
    pages = _candidate_pages()
    successful = 0
    total_facts = 0
    found_pages = []

    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        for category, page in pages:
            text = wiki_extract(page)
            if text and len(text) > 300:
                successful += 1
                total_facts += insert_facts(conn, page, text, category, confidence=0.88)
                found_pages.append(page)
                if successful % 10 == 0:
                    print(f" progress: {successful} pages found, {total_facts} facts")
            # Rate limit: pause after every request, whether it hit or missed.
            time.sleep(0.4)
    finally:
        # Release the connection even if the crawl is interrupted or fails.
        conn.close()

    print(f"\n=== DONE: {successful}/{len(pages)} pages found, {total_facts} facts ===")
    print(f"Sample found pages: {found_pages[:15]}")
    return {"pages_found": successful, "pages_tried": len(pages),
            "facts": total_facts}
|
|
|
|
|
|
# Script entry point: run the crawl and print its summary dict as JSON.
if __name__ == "__main__":
    summary = main()
    print(json.dumps(summary))
|