1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
113 lines
3.8 KiB
Python
Executable File
113 lines
3.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Yearbook (godišnjak) pipeline:
1. Find yearbook PDFs in the DB (table dokumenti) + scrape sport-pgz.hr
2. Download each PDF locally
3. Parse the text out of the PDF
4. UPDATE pgz_sport.dokumenti SET sadrzaj = parsed_text
5. Save chunks for RAG
"""
|
|
import os, sys, hashlib, requests, re
|
|
from pathlib import Path
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
import pypdf
|
|
|
|
# libpq connection string for the database that holds pgz_sport.dokumenti.
# SECURITY: the literal fallback below embeds a password in source control.
# Supply the DSN through the PGZ_DSN environment variable instead; once every
# deployment does so, delete the fallback and rotate the password.
DSN = os.environ.get(
    "PGZ_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# Local cache directory for downloaded PDFs. It is created at import time so
# the download step can write into it without further checks.
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
def download_pdf(url, dest):
    """Download *url* to *dest* unless a plausible copy is already cached.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the local target file.

    Returns:
        *dest* when the file is available locally (already cached or freshly
        downloaded), otherwise ``None``. Failures are printed, never raised,
        so one bad document does not abort the caller's loop.
    """
    # Files of 1000 bytes or less are treated as failed downloads and fetched
    # again; anything larger is trusted as a cached copy.
    if dest.exists() and dest.stat().st_size > 1000:
        return dest
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=60, allow_redirects=True)
        if r.status_code != 200 or len(r.content) <= 1000:
            # Previously this case returned None without any message.
            print(f" ERR download {url}: HTTP {r.status_code}, {len(r.content)} bytes")
            return None
        # Write to a sibling temp file and rename it into place. A write that
        # is interrupted halfway must not leave a truncated file at *dest*,
        # because the size check above would accept it as cached forever.
        tmp = dest.with_name(dest.name + '.part')
        tmp.write_bytes(r.content)
        tmp.replace(dest)
        return dest
    except Exception as e:
        # Deliberately broad: network, HTTP and filesystem errors are all
        # reported the same way and turned into a None result.
        print(f" ERR download {url}: {e}")
    return None
|
|
|
|
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF; it is converted with ``str()`` before
            being handed to ``pypdf.PdfReader``.

    Returns:
        A ``(text, page_count)`` tuple. Each page that extracts successfully
        contributes its text followed by a newline; a page whose extraction
        fails is reported and skipped. If the file cannot be read at all the
        result is ``('', 0)``.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception as e:
                # Best effort per page, but say which page was lost and do not
                # swallow KeyboardInterrupt/SystemExit as a bare except would.
                print(f" WARN parse {path}: page {page_no} skipped: {e}")
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        print(f" ERR parse {path}: {e}")
        return '', 0
|
|
|
|
_MAX_CONTENT_CHARS = 500000  # cap on the text stored in dokumenti.sadrzaj
_CHUNK_SIZE = 1000           # characters per RAG chunk
_CHUNK_STEP = 800            # distance between chunk starts (200 chars overlap)
_MAX_CHUNKS = 200            # only this many leading windows are considered


def _select_targets(conn):
    """Return the dokumenti rows that look like yearbook PDFs, highest id first."""
    # execute() gets no parameters, so the '%' wildcards are sent verbatim and
    # need no escaping. In the last condition AND binds tighter than OR, so
    # the PDF requirement applies only to the generic '%godi%njak%' title match.
    with conn, conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("""
            SELECT id, title, url, pdf_url, vrsta, sadrzaj
            FROM pgz_sport.dokumenti
            WHERE (
            title ILIKE '%sportski godi%njak%' OR title ILIKE '%godi%njak HNS%'
            OR title ILIKE 'ZSPGZ%' OR title ILIKE '%godi%njak ZSPGZ%'
            OR url ILIKE '%godisnjak%.pdf' OR pdf_url ILIKE '%godisnjak%.pdf'
            OR title ILIKE '%godi%njak%' AND (url ILIKE '%pdf' OR pdf_url IS NOT NULL)
            )
            ORDER BY id DESC
        """)
        return cur.fetchall()


def _chunk_text(text):
    """Split *text* into overlapping ``(chunk_idx, content)`` pairs for RAG.

    Windows of _CHUNK_SIZE characters start every _CHUNK_STEP characters.
    Only the first _MAX_CHUNKS windows are considered, and windows whose
    stripped length is 50 or less are dropped. A dropped window still
    consumes its index, so chunk_idx always reflects the window position.
    """
    # Slicing the range avoids materializing windows beyond the limit.
    starts = range(0, len(text), _CHUNK_STEP)[:_MAX_CHUNKS]
    windows = ((idx, text[start:start + _CHUNK_SIZE]) for idx, start in enumerate(starts))
    return [(idx, chunk) for idx, chunk in windows if len(chunk.strip()) > 50]


def _store_document(conn, doc_id, text):
    """Save the parsed text and its chunks for one document atomically."""
    # One transaction per document: if any statement fails, the content
    # update and the chunk replacement are rolled back together. Otherwise a
    # row could be marked as parsed while its chunks are missing, and later
    # runs would skip it as "already parsed".
    with conn, conn.cursor() as cur:
        cur.execute("""
            UPDATE pgz_sport.dokumenti
            SET sadrzaj = %s, last_updated = now()
            WHERE id = %s
        """, (text[:_MAX_CONTENT_CHARS], doc_id))
        cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
        for idx, chunk in _chunk_text(text):
            cur.execute("""
                INSERT INTO pgz_sport.dokument_chunks (dokument_id, chunk_idx, content)
                VALUES (%s, %s, %s)
            """, (doc_id, idx, chunk))


def main():
    """Download, parse and store every yearbook PDF that is not parsed yet.

    Selects candidate rows from pgz_sport.dokumenti, skips rows without a
    ``.pdf`` URL or with more than 500 characters of stored content, then
    downloads and parses each remaining PDF and writes the text plus its RAG
    chunks back to the database. Progress is printed to stdout.

    Raises:
        psycopg2.Error: if connecting or any database statement fails; the
            affected document's transaction is rolled back first.
    """
    conn = psycopg2.connect(DSN)
    # Explicit transactions (see _store_document) instead of autocommit.
    conn.autocommit = False
    try:
        targets = _select_targets(conn)
        print(f"Targets: {len(targets)}")

        parsed_count = 0
        for t in targets:
            url = t['pdf_url'] or t['url']
            if not url or not url.lower().endswith('.pdf'):
                continue

            if t['sadrzaj'] and len(t['sadrzaj']) > 500:
                print(f" ⏭ ID {t['id']}: already parsed ({len(t['sadrzaj'])} chars)")
                continue

            # Rows matched only through their URL may have a NULL title.
            print(f" 📄 ID {t['id']}: {(t['title'] or '')[:60]}")
            fname = re.sub(r'[^\w.-]', '_', os.path.basename(url))[:100]
            dest = UPLOAD_DIR / f"{t['id']}_{fname}"

            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            text, pages = parse_pdf(downloaded)
            # PostgreSQL text values cannot contain NUL and psycopg2 rejects
            # such parameters, so strip any NUL the PDF extraction produced.
            text = text.replace('\x00', '')
            if not text:
                continue

            print(f" ✓ {pages} pages, {len(text)} chars")

            _store_document(conn, t['id'], text)
            parsed_count += 1

        print(f"\nDone. Parsed: {parsed_count}")
    finally:
        # Always release the connection, including on errors.
        conn.close()
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|