PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
+121
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Reingest godišnjaka 2006-2024 — full text from PDFs."""
|
||||
import os, re, hashlib, subprocess, requests, psycopg2
|
||||
from datetime import date
|
||||
|
||||
# Keyword arguments for psycopg2.connect(**DB).
#
# SECURITY: the password used to be available only as a literal committed to
# this file. It can now be supplied through the RINET_DB_PASSWORD environment
# variable; the literal is kept solely as a fallback so existing deployments
# keep working. TODO(review): rotate this credential and drop the fallback.
DB = dict(
    host='localhost',
    port=5432,
    dbname='rinet_v3',
    user='rinet',
    password=os.environ.get('RINET_DB_PASSWORD', 'R1net2026!SecureDB#v7'),
)
|
||||
UA = "RiNET-Civic/1.0 (https://rinet.one)"
|
||||
BASE = "https://sport-pgz.hr/upload/dokumenti"
|
||||
|
||||
GODISNJACI = [
|
||||
("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
|
||||
("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
|
||||
("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
|
||||
("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
|
||||
("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
|
||||
("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
|
||||
("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
|
||||
("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
|
||||
("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
|
||||
("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
|
||||
("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
|
||||
("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
|
||||
("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
|
||||
("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
|
||||
("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
|
||||
("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
|
||||
("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
|
||||
("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
|
||||
]
|
||||
|
||||
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
|
||||
os.makedirs(OUT_DIR, exist_ok=True)
|
||||
|
||||
s = requests.Session()
|
||||
s.headers.update({"User-Agent": UA})
|
||||
|
||||
conn = psycopg2.connect(**DB); conn.autocommit = True
|
||||
cu = conn.cursor()
|
||||
|
||||
# Main ingest loop: for every yearbook, make sure the PDF and its extracted
# text are cached under OUT_DIR, then update the matching row in
# pgz_sport.dokumenti or insert a new one. Any failure for one year is
# reported and that year is skipped (deliberate best effort); the remaining
# years are still processed.
for year, url in GODISNJACI:
    pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
    txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

    # Download if missing (a file under 100000 bytes is treated as missing).
    downloaded = False
    if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
        print(f" [{year}] downloading from {url}")
        # Write to a side file and move it into place only once complete, so
        # an interrupted write can never leave a truncated PDF that later
        # passes the size check above.
        part_path = pdf_path + ".part"
        try:
            r = s.get(url, timeout=120)
            if r.status_code != 200:
                print(f" [{year}] HTTP {r.status_code}, skip")
                continue
            with open(part_path, "wb") as f:
                f.write(r.content)
            os.replace(part_path, pdf_path)
            downloaded = True
            print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
        except Exception as e:
            print(f" [{year}] download failed: {e}")
            continue

    # Extract text via pdftotext. Always re-extract after a fresh download,
    # otherwise a text file produced from an older PDF would be reused.
    if downloaded or not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
        print(f" [{year}] extracting text…")
        try:
            subprocess.run(["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                           check=True, timeout=300, capture_output=True)
        except Exception as e:
            print(f" [{year}] pdftotext failed: {e}")
            continue

    # Read text (undecodable bytes are replaced rather than raising).
    try:
        with open(txt_path, encoding='utf-8', errors='replace') as f:
            text = f.read()
    except Exception as e:
        print(f" [{year}] read failed: {e}")
        continue

    if len(text) < 5000:
        print(f" [{year}] text too short ({len(text)} chars), skip")
        continue

    # Stored in the column named "sha1", but the value is the first 40 hex
    # digits of the SHA-256 of the extracted text.
    sha = hashlib.sha256(text.encode()).hexdigest()[:40]
    # Page count estimated from form-feed characters in the extracted text.
    # NOTE(review): if pdftotext also emits a form feed after the last page,
    # this over-counts by one — confirm.
    pages = text.count(chr(12)) + 1

    title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
    if year in ("2023", "2024"):
        title = f"Sportski godišnjak ZS PGŽ {year} (web)"
    izdano = date(int(year), 12, 31)

    # Values shared by the UPDATE and INSERT branches below.
    opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
    kljucne = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

    # Update existing or insert new.
    # The title/fname patterns alone never match rows written by this script
    # (its titles spell "godišnjak" with "š", and fname is not set by the
    # INSERT below), which made every re-run insert duplicates. Matching on
    # vrsta + godina as well finds those rows.
    cu.execute("""SELECT id FROM pgz_sport.dokumenti
                  WHERE (LOWER(title) LIKE %s OR fname LIKE %s
                         OR (vrsta = %s AND godina = %s))
                  ORDER BY id LIMIT 1""",
               (f"%godisnjak%{year}%", f"%godisnjak_{year}%", 'godisnjak', int(year)))
    existing = cu.fetchone()

    if existing:
        cu.execute("""UPDATE pgz_sport.dokumenti SET
                      title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                      url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
                      kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
                      WHERE id=%s""",
                   (title, text, sha, int(year), izdano,
                    url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, kljucne,
                    existing[0]))
        print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
    else:
        cu.execute("""INSERT INTO pgz_sport.dokumenti
                      (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                       vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
                      VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
                   (title, text, sha, int(year), izdano, url, url, url,
                    'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
                    opis, kljucne))
        new_id = cu.fetchone()[0]
        print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")
|
||||
|
||||
# Final summary: list every yearbook row in the database with the length of
# its stored text and (the first 80 characters of) its PDF URL.
# The connection is closed even if the query or the printing fails.
try:
    cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
                  WHERE vrsta='godisnjak' ORDER BY godina""")
    print("\n=== Godišnjaci u DB ===")
    for godina, n_chars, pdf_url in cu.fetchall():
        # Rows can have NULL sadrzaj or pdf_url (length(NULL) is NULL);
        # formatting or slicing None would raise TypeError, so fall back to
        # 0 / empty string for those rows.
        shown_len = n_chars or 0
        shown_url = (pdf_url or "")[:80]
        print(f" {godina}: {shown_len:,} chars ({shown_url})")
finally:
    conn.close()
|
||||
Reference in New Issue
Block a user