#!/usr/bin/env python3
"""
Yearbook ("godišnjak") pipeline:

1. Find yearbook PDF rows in the database (table pgz_sport.dokumenti).
2. Download each PDF to local storage.
3. Extract the text from the PDF.
4. UPDATE pgz_sport.dokumenti SET sadrzaj = <extracted text>.
5. Store overlapping text chunks in pgz_sport.dokument_chunks for RAG.

NOTE(review): the original header also listed "scrape sport-pgz.hr" as part
of step 1; no scraping code exists in this file.
"""
import hashlib
import os
import re
import sys
from pathlib import Path

import psycopg2
import pypdf
import requests
from psycopg2.extras import RealDictCursor

# Connection string; raises KeyError at import time if DB_PASSWORD is unset.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)

MIN_PDF_BYTES = 1000          # anything this small or smaller is not accepted as a PDF
ALREADY_PARSED_CHARS = 500    # rows whose sadrzaj is longer than this are skipped
MAX_CONTENT_CHARS = 500000    # cap on the text stored in dokumenti.sadrzaj
CHUNK_SIZE = 1000             # characters per RAG chunk
CHUNK_STRIDE = 800            # chunk start offset step -> 200 chars of overlap
MAX_CHUNKS = 200              # at most this many chunk windows per document
MIN_CHUNK_CHARS = 50          # chunks with this little non-blank text are dropped


def download_pdf(url, dest):
    """Download *url* to *dest* unless a usable copy is already on disk.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the local target file.

    Returns:
        *dest* when a file larger than ``MIN_PDF_BYTES`` already exists or
        was downloaded successfully, otherwise ``None``. Errors are printed,
        never raised.
    """
    if dest.exists() and dest.stat().st_size > MIN_PDF_BYTES:
        return dest
    # Write to a sibling temp file and rename, so an interrupted write can
    # never leave a truncated file that later passes the size check above.
    tmp = dest.with_name(dest.name + '.part')
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60, allow_redirects=True)
        if r.status_code == 200 and len(r.content) > MIN_PDF_BYTES:
            tmp.write_bytes(r.content)
            tmp.replace(dest)
            return dest
        print(f" ERR download {url}: HTTP {r.status_code}, {len(r.content)} bytes")
    except Exception as e:
        # Best-effort boundary: one bad URL must not abort the whole run.
        print(f" ERR download {url}: {e}")
    return None


def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Pages whose extraction fails are logged and skipped; each page that is
    extracted contributes its text followed by a newline.

    Returns:
        ``(text, page_count)``; ``('', 0)`` when the file cannot be opened
        or read as a PDF.
    """
    try:
        r = pypdf.PdfReader(str(path))
        parts = []
        for page_no, p in enumerate(r.pages, start=1):
            try:
                parts.append((p.extract_text() or '') + '\n')
            except Exception as e:
                # Skip the broken page but keep the rest of the document.
                print(f" WARN parse {path} page {page_no}: {e}")
        return ''.join(parts), len(r.pages)
    except Exception as e:
        print(f" ERR parse {path}: {e}")
        return '', 0


def _chunk_rows(dokument_id, text):
    """Build ``(dokument_id, chunk_idx, content)`` rows for dokument_chunks.

    Windows of ``CHUNK_SIZE`` characters start every ``CHUNK_STRIDE``
    characters; only the first ``MAX_CHUNKS`` windows are considered and
    windows with ``MIN_CHUNK_CHARS`` or fewer non-blank characters are
    dropped (their index is not reused, so chunk_idx may have gaps).
    """
    starts = range(0, len(text), CHUNK_STRIDE)[:MAX_CHUNKS]
    windows = (text[s:s + CHUNK_SIZE] for s in starts)
    return [
        (dokument_id, idx, window)
        for idx, window in enumerate(windows)
        if len(window.strip()) > MIN_CHUNK_CHARS
    ]


def main():
    """Run the pipeline over every matching row of pgz_sport.dokumenti.

    NOTE(review): the connection runs in autocommit mode, so the UPDATE,
    DELETE and INSERTs for one document are not atomic. Port 6432 may be a
    PgBouncer endpoint; confirm its pool mode before wrapping them in a
    transaction.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        # 1. Fetch candidate yearbook rows. The inner % wildcard lets both
        #    "godišnjak" and "godisnjak" match. SQL AND binds tighter than
        #    OR, so the last title test is the only one tied to the
        #    url/pdf_url condition.
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, title, url, pdf_url, vrsta, sadrzaj
                FROM pgz_sport.dokumenti
                WHERE (
                    title ILIKE '%sportski godi%njak%'
                    OR title ILIKE '%godi%njak HNS%'
                    OR title ILIKE 'ZSPGZ%'
                    OR title ILIKE '%godi%njak ZSPGZ%'
                    OR url ILIKE '%godisnjak%.pdf'
                    OR pdf_url ILIKE '%godisnjak%.pdf'
                    OR title ILIKE '%godi%njak%' AND (url ILIKE '%pdf' OR pdf_url IS NOT NULL)
                )
                ORDER BY id DESC
            """)
            targets = cur.fetchall()
        print(f"Targets: {len(targets)}")

        parsed_count = 0
        for t in targets:
            url = t['pdf_url'] or t['url']
            if not url or not url.lower().endswith('.pdf'):
                continue
            if t['sadrzaj'] and len(t['sadrzaj']) > ALREADY_PARSED_CHARS:
                print(f" ⏭ ID {t['id']}: already parsed ({len(t['sadrzaj'])} chars)")
                continue
            # title can be NULL for rows matched only through url/pdf_url.
            print(f" 📄 ID {t['id']}: {(t['title'] or '')[:60]}")

            # Local file name: row id + sanitized last URL segment.
            fname = re.sub(r'[^\w.-]', '_', os.path.basename(url))[:100]
            dest = UPLOAD_DIR / f"{t['id']}_{fname}"
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            text, pages = parse_pdf(downloaded)
            # PostgreSQL text values cannot contain NUL and psycopg2 raises
            # ValueError on them; PDF extraction occasionally emits NULs.
            text = text.replace('\x00', '')
            # Image-only PDFs yield nothing but the per-page newlines; do
            # not overwrite sadrzaj or wipe existing chunks with that.
            if not text.strip():
                continue
            print(f" ✓ {pages} pages, {len(text)} chars")

            with conn.cursor() as cur:
                # Store the (capped) full text on the document row.
                cur.execute("""
                    UPDATE pgz_sport.dokumenti
                    SET sadrzaj = %s, last_updated = now()
                    WHERE id = %s
                """, (text[:MAX_CONTENT_CHARS], t['id']))
                # Replace the document's RAG chunks.
                cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (t['id'],))
                cur.executemany("""
                    INSERT INTO pgz_sport.dokument_chunks (dokument_id, chunk_idx, content)
                    VALUES (%s, %s, %s)
                """, _chunk_rows(t['id'], text))
            parsed_count += 1

        print(f"\nDone. Parsed: {parsed_count}")
    finally:
        conn.close()


if __name__ == '__main__':
    main()