#!/usr/bin/env python3 from dotenv import load_dotenv load_dotenv('/opt/rinet-gpu/.env.master') # auto-added by patch_scrapers_with_dotenv.sh """ Godišnjak pipeline v2 — popravljen za pravu shemu. """ import os, sys, hashlib, requests, re from pathlib import Path import psycopg2 from psycopg2.extras import RealDictCursor import pypdf DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}" UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci') UPLOAD_DIR.mkdir(parents=True, exist_ok=True) # 18 godišnjaka 2006-2024 (otkriveni scrapeom) GODISNJAK_URLS = [ ("https://sport-pgz.hr/upload/dokumenti/publikacije/godisnjak-2006-print.pdf", 2006), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2007.pdf", 2007), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2008.pdf", 2008), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2009.pdf", 2009), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2010.pdf", 2010), ("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2012.pdf", 2012), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2013.pdf", 2013), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2014.pdf", 2014), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2015.pdf", 2015), ("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2017.pdf", 2017), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2018.pdf", 2018), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2019.pdf", 2019), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2020.pdf", 2020), ("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2021.pdf", 2021), ("https://sport-pgz.hr/upload/dokumenti/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022), ("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023), ("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024), ] def download_pdf(url, dest): if dest.exists() and dest.stat().st_size > 1000: print(f" [cached] {dest.name} ({dest.stat().st_size//1024}KB)") return dest try: r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=120, allow_redirects=True) if r.status_code == 200 and len(r.content) > 1000: dest.write_bytes(r.content) return dest else: print(f" ✗ HTTP {r.status_code}") except Exception as e: print(f" ERR: {e}") return None def parse_pdf(path): try: r = pypdf.PdfReader(str(path)) text = '' for p in r.pages: try: text += (p.extract_text() or '') + '\n' except: pass return text, len(r.pages) except Exception as e: return '', 0 def main(): conn = psycopg2.connect(DSN); conn.autocommit = True # Get chunks table column name with conn.cursor() as cur: cur.execute(""" SELECT column_name FROM information_schema.columns WHERE table_schema='pgz_sport' AND table_name='dokument_chunks' """) cols = [r[0] for r in cur.fetchall()] print(f"dokument_chunks columns: {cols}") parsed_count = 0 for url, godina in GODISNJAK_URLS: title = f"Sportski godišnjak ZSPGZ {godina}" fname = f"sportski-godisnjak-{godina}.pdf" dest = UPLOAD_DIR / fname print(f"\n📄 {title}") downloaded = download_pdf(url, dest) if not downloaded: continue # Compute SHA1 sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest() text, pages = parse_pdf(downloaded) if not text: print(f" ✗ parse failed") continue print(f" ✓ {pages} pages, {len(text)} chars") # UPSERT u dokumenti with conn.cursor(cursor_factory=RealDictCursor) as cur: # Check if exists by sha1 cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,)) existing = cur.fetchone() if existing: doc_id = existing['id'] cur.execute(""" UPDATE pgz_sport.dokumenti SET title = %s, godina = %s, vrsta = 'sportski-godisnjak', url = %s, pdf_url = %s, sadrzaj = %s, sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska', organizacija = 'Zajednica sportova Primorsko-goranske županije', izvor_url = %s, last_updated = now() WHERE id = %s """, (title, godina, url, url, text[:500000], 'https://sport-pgz.hr', doc_id)) print(f" ↻ UPDATE id={doc_id}") else: cur.execute(""" INSERT INTO pgz_sport.dokumenti (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj, sluzbeni_glasnik, razina, organizacija, izvor_url) VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s, 'ZSPGZ', 'zupanijska', 'Zajednica sportova PGŽ', 'https://sport-pgz.hr') RETURNING id """, (title, fname, godina, url, url, sha1, text[:500000])) doc_id = cur.fetchone()['id'] print(f" + INSERT id={doc_id}") # Chunks (proper schema) if 'idx' in cols or 'chunk_idx' in cols or 'page' in cols or 'i' in cols: with conn.cursor() as cur: cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,)) # Find INSERT pattern by columns idx_col = 'idx' if 'idx' in cols else ('chunk_idx' if 'chunk_idx' in cols else ('i' if 'i' in cols else 'page')) content_col = 'content' if 'content' in cols else ('chunk' if 'chunk' in cols else ('text' if 'text' in cols else 'sadrzaj')) chunks = [text[i:i+1000] for i in range(0, len(text), 800)] for i, ch in enumerate(chunks[:300]): if len(ch.strip()) > 50: try: cur.execute(f""" INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col}) VALUES (%s, %s, %s) """, (doc_id, i, ch)) except Exception as e: print(f" ERR chunk {i}: {e}"); break parsed_count += 1 print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}") if __name__ == '__main__': main()