#!/usr/bin/env python3
"""Reingest godišnjaka 2006-2024 — full text from PDFs.

For every entry in GODISNJACI the script:

1. downloads the PDF into OUT_DIR unless a large-enough copy is already there,
2. extracts its text with the ``pdftotext`` command-line tool,
3. updates the matching row in ``pgz_sport.dokumenti`` or inserts a new one.

A per-year summary of the stored yearbooks is printed at the end. A failure
for one year is reported and that year is skipped; the run continues.
"""
import hashlib
import os
import re
import subprocess
from datetime import date

import psycopg2
import requests

# NOTE(review): the original call passed ``port`` twice (6432, then 5432).
# Python rejects that with "SyntaxError: keyword argument repeated", so the
# script could not start. The last value is kept here — confirm which port is
# the intended one.
DB = dict(host="10.10.0.2", port=5432, dbname='rinet_v3', user='rinet',
          password=os.environ["DB_PASSWORD"])
UA = "RiNET-Civic/1.0 (https://rinet.one)"
BASE = "https://sport-pgz.hr/upload/dokumenti"

# (year, PDF URL) pairs. There is no entry for 2016.
GODISNJACI = [
    ("2006", f"{BASE}/publikacije/godisnjak-2006-print.pdf"),
    ("2007", f"{BASE}/publikacije/Sportski-godisnjak-2007.pdf"),
    ("2008", f"{BASE}/publikacije/Sportski-godisnjak-2008.pdf"),
    ("2009", f"{BASE}/publikacije/Sportski-godisnjak-2009.pdf"),
    ("2010", f"{BASE}/publikacije/Sportski-godisnjak-2010.pdf"),
    ("2011", f"{BASE}/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
    ("2012", f"{BASE}/publikacije/Sportski-godisnjak-2012.pdf"),
    ("2013", f"{BASE}/publikacije/Sportski-godisnjak-2013.pdf"),
    ("2014", f"{BASE}/publikacije/Sportski-godisnjak-2014.pdf"),
    ("2015", f"{BASE}/publikacije/Sportski-godisnjak-2015.pdf"),
    ("2017", f"{BASE}/publikacije/sportski-godisnjak-2017.pdf"),
    ("2018", f"{BASE}/publikacije/Sportski-godisnjak-2018.pdf"),
    ("2019", f"{BASE}/publikacije/Sportski-godisnjak-2019.pdf"),
    ("2020", f"{BASE}/publikacije/Sportski-godisnjak-2020.pdf"),
    ("2021", f"{BASE}/publikacije/Sportski-godisnjak-2021.pdf"),
    ("2022", f"{BASE}/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
    ("2023", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
    ("2024", f"{BASE}/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
]

OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"

# Cached files smaller than these sizes are treated as missing and redone.
MIN_PDF_BYTES = 100000
MIN_TXT_BYTES = 1000
# Extracted text shorter than this is not stored.
MIN_TEXT_CHARS = 5000


def _is_missing_or_small(path, min_bytes):
    """Return True when *path* does not exist or is smaller than *min_bytes*."""
    return not os.path.exists(path) or os.path.getsize(path) < min_bytes


def _download_pdf(session, year, url, pdf_path):
    """Fetch *url* into *pdf_path*; return True on success, False otherwise."""
    print(f" [{year}] downloading from {url}")
    try:
        r = session.get(url, timeout=120)
        if r.status_code != 200:
            print(f" [{year}] HTTP {r.status_code}, skip")
            return False
        with open(pdf_path, "wb") as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as e:
        print(f" [{year}] download failed: {e}")
        return False
    print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")
    return True


def _extract_text(year, pdf_path, txt_path):
    """Run ``pdftotext`` on *pdf_path*; return True on success, False otherwise."""
    print(f" [{year}] extracting text…")
    try:
        subprocess.run(
            ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
            check=True, timeout=300, capture_output=True,
        )
    except (subprocess.SubprocessError, OSError) as e:
        # SubprocessError covers a non-zero exit and the timeout; OSError
        # covers a missing ``pdftotext`` binary.
        print(f" [{year}] pdftotext failed: {e}")
        return False
    return True


def _load_text(session, year, url):
    """Return the yearbook text for *year*, or None if any step fails.

    Cached PDF/TXT files in OUT_DIR are reused when they are large enough.
    """
    pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
    txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

    if _is_missing_or_small(pdf_path, MIN_PDF_BYTES):
        if not _download_pdf(session, year, url, pdf_path):
            return None

    if _is_missing_or_small(txt_path, MIN_TXT_BYTES):
        if not _extract_text(year, pdf_path, txt_path):
            return None

    try:
        # errors='replace' keeps undecodable bytes from aborting the read.
        with open(txt_path, encoding='utf-8', errors='replace') as f:
            text = f.read()
    except OSError as e:
        print(f" [{year}] read failed: {e}")
        return None

    if len(text) < MIN_TEXT_CHARS:
        print(f" [{year}] text too short ({len(text)} chars), skip")
        return None
    return text


def _find_existing_id(cur, year):
    """Return the id of the lowest-id row that looks like *year*'s yearbook, or None.

    The title is matched in both spellings: ``godisnjak`` (the original
    pattern) and ``godišnjak``, which is how this script writes its titles.
    With only the ASCII pattern the rows inserted here were never found again
    and each re-run inserted duplicates.
    """
    cur.execute(
        """SELECT id FROM pgz_sport.dokumenti
           WHERE (LOWER(title) LIKE %s OR LOWER(title) LIKE %s OR fname LIKE %s)
           ORDER BY id LIMIT 1""",
        (f"%godisnjak%{year}%", f"%godišnjak%{year}%", f"%godisnjak_{year}%"),
    )
    row = cur.fetchone()
    return row[0] if row else None


def _store(cur, year, url, text):
    """Update the existing row for *year* or insert a new one."""
    # First 40 hex characters of a SHA-256 digest, stored in the ``sha1`` column.
    sha = hashlib.sha256(text.encode()).hexdigest()[:40]
    # Page count from form-feed characters — presumably the page separators
    # written by pdftotext; TODO confirm the +1 does not over-count.
    pages = text.count(chr(12)) + 1
    title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
    if year in ("2023", "2024"):
        title = f"Sportski godišnjak ZS PGŽ {year} (web)"
    izdano = date(int(year), 12, 31)

    # Column values shared by UPDATE and INSERT, in the order both use.
    values = (
        title, text, sha, int(year), izdano, url, url, url,
        'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
        f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova",
        ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši'],
    )

    existing_id = _find_existing_id(cur, year)
    if existing_id is not None:
        cur.execute(
            """UPDATE pgz_sport.dokumenti
               SET title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
                   url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s,
                   razina=%s, kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
               WHERE id=%s""",
            values + (existing_id,),
        )
        print(f" [{year}] ✓ UPDATED dok #{existing_id}: {len(text)} chars, {pages} pages")
    else:
        cur.execute(
            """INSERT INTO pgz_sport.dokumenti
               (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true)
               RETURNING id""",
            values,
        )
        new_id = cur.fetchone()[0]
        print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")


def _print_summary(cur):
    """Print year, text length and PDF URL of every stored yearbook."""
    cur.execute(
        """SELECT godina, length(sadrzaj), pdf_url
           FROM pgz_sport.dokumenti
           WHERE vrsta='godisnjak'
           ORDER BY godina"""
    )
    print("\n=== Godišnjaci u DB ===")
    for r in cur.fetchall():
        # length(sadrzaj) and pdf_url are NULL for rows without text or URL;
        # fall back to 0 / "" so the report does not crash on them.
        print(f" {r[0]}: {(r[1] or 0):,} chars ({(r[2] or '')[:80]})")


os.makedirs(OUT_DIR, exist_ok=True)

s = requests.Session()
s.headers.update({"User-Agent": UA})

conn = psycopg2.connect(**DB)
conn.autocommit = True
cu = conn.cursor()

try:
    for year, url in GODISNJACI:
        text = _load_text(s, year, url)
        if text is None:
            continue
        _store(cu, year, url, text)

    # Final
    _print_summary(cu)
finally:
    # Close the connection even when a database error aborts the run.
    conn.close()