1d02c0897d
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
151 lines
6.9 KiB
Python
Executable File
151 lines
6.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Yearbook (godišnjak) pipeline v2 — fixed to match the actual database schema.
"""
|
|
import os, sys, hashlib, requests, re
|
|
from pathlib import Path
|
|
import psycopg2
|
|
from psycopg2.extras import RealDictCursor
|
|
import pypdf
|
|
|
|
# PostgreSQL connection string used by main(). It can be overridden without
# editing this file by setting the PGZ_SPORT_DSN environment variable.
# NOTE(review): the fallback literal embeds a plaintext password in source
# control — rotate it and remove the fallback once every deployment sets
# PGZ_SPORT_DSN.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)

# Local directory where downloaded yearbook PDFs are stored; it is created
# (including parents) as a side effect of importing this module.
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
# The 18 yearbooks covering 2006-2024 (discovered by scraping). The list has
# no entry for 2016, and the 2022 edition lives under a different directory.
_DOKUMENTI_ROOT = "https://sport-pgz.hr/upload/dokumenti"

# Each entry is a (pdf_url, year) tuple, ordered by year.
GODISNJAK_URLS = [
    (f"{_DOKUMENTI_ROOT}/{rel_path}", year)
    for year, rel_path in (
        (2006, "publikacije/godisnjak-2006-print.pdf"),
        (2007, "publikacije/Sportski-godisnjak-2007.pdf"),
        (2008, "publikacije/Sportski-godisnjak-2008.pdf"),
        (2009, "publikacije/Sportski-godisnjak-2009.pdf"),
        (2010, "publikacije/Sportski-godisnjak-2010.pdf"),
        (2011, "publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
        (2012, "publikacije/Sportski-godisnjak-2012.pdf"),
        (2013, "publikacije/Sportski-godisnjak-2013.pdf"),
        (2014, "publikacije/Sportski-godisnjak-2014.pdf"),
        (2015, "publikacije/Sportski-godisnjak-2015.pdf"),
        (2017, "publikacije/sportski-godisnjak-2017.pdf"),
        (2018, "publikacije/Sportski-godisnjak-2018.pdf"),
        (2019, "publikacije/Sportski-godisnjak-2019.pdf"),
        (2020, "publikacije/Sportski-godisnjak-2020.pdf"),
        (2021, "publikacije/Sportski-godisnjak-2021.pdf"),
        (2022, "sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
        (2023, "publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
        (2024, "publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
    )
]
|
|
|
|
def download_pdf(url, dest):
    """Fetch the PDF at ``url`` into ``dest``, reusing a cached copy if present.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the target file.

    Returns:
        ``dest`` when a file larger than 1000 bytes is on disk afterwards
        (either already cached or freshly downloaded), otherwise ``None``.
        Failures are printed and never raised, so the caller can skip the
        document and carry on with the next one.
    """
    # Any existing file over 1000 bytes counts as a completed earlier download.
    if dest.exists() and dest.stat().st_size > 1000:
        print(f" [cached] {dest.name} ({dest.stat().st_size//1024}KB)")
        return dest
    try:
        r = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=120, allow_redirects=True)
        if r.status_code == 200 and len(r.content) > 1000:
            # Write to a sibling ".part" file and rename it into place, so an
            # interrupted write can never leave a truncated file at ``dest``
            # that the size check above would later accept as cached.
            tmp = dest.with_name(dest.name + ".part")
            tmp.write_bytes(r.content)
            tmp.replace(dest)
            return dest
        else:
            # Include the body size: a 200 response with a tiny body is
            # rejected too, and the status code alone would not explain why.
            print(f" ✗ HTTP {r.status_code} ({len(r.content)} bytes)")
    except Exception as e:
        # Deliberately broad: one unreachable PDF must not abort the run.
        print(f" ERR: {e}")
    return None
|
|
|
|
def parse_pdf(path):
    """Extract the text of every page of the PDF at ``path``.

    Args:
        path: Location of the PDF file; it is converted with ``str()`` before
            being handed to ``pypdf.PdfReader``.

    Returns:
        A ``(text, page_count)`` tuple. ``text`` is the concatenation of each
        successfully extracted page followed by a newline; pages whose
        extraction fails contribute nothing. ``('', 0)`` is returned when the
        file cannot be opened or read at all.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                print(f" ! page {page_no} skipped: {e}")
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        # Report why the document is unusable instead of failing silently.
        print(f" ERR parse: {e}")
        return '', 0
|
|
|
|
def _pick_chunk_columns(cols):
    """Choose the index and content column names of ``dokument_chunks``.

    Returns ``(idx_col, content_col)``, or ``None`` when none of the known
    index column names ('idx', 'chunk_idx', 'i', 'page') is present.
    """
    idx_col = next((c for c in ('idx', 'chunk_idx', 'i', 'page') if c in cols), None)
    if idx_col is None:
        return None
    # 'sadrzaj' is the last-resort name even when it is not listed in ``cols``.
    content_col = next((c for c in ('content', 'chunk', 'text') if c in cols), 'sadrzaj')
    return idx_col, content_col


def _upsert_dokument(conn, title, fname, godina, url, sha1, text):
    """Insert or update the yearbook row in ``pgz_sport.dokumenti``; return its id.

    An existing row is looked up by ``sha1``. At most the first 500000
    characters of ``text`` are stored in ``sadrzaj``.
    """
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        # Check whether this exact file (by SHA1) is already stored.
        cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,))
        existing = cur.fetchone()

        if existing:
            doc_id = existing['id']
            # NOTE(review): this UPDATE writes the long form of ``organizacija``
            # while the INSERT below writes 'Zajednica sportova PGŽ' — confirm
            # which value is intended and unify them.
            cur.execute("""
                UPDATE pgz_sport.dokumenti
                SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                    url = %s, pdf_url = %s, sadrzaj = %s,
                    sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                    organizacija = 'Zajednica sportova Primorsko-goranske županije',
                    izvor_url = %s, last_updated = now()
                WHERE id = %s
            """, (title, godina, url, url, text[:500000], 'https://sport-pgz.hr', doc_id))
            print(f" ↻ UPDATE id={doc_id}")
        else:
            cur.execute("""
                INSERT INTO pgz_sport.dokumenti
                    (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                     sluzbeni_glasnik, razina, organizacija, izvor_url)
                VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                        'ZSPGZ', 'zupanijska', 'Zajednica sportova PGŽ', 'https://sport-pgz.hr')
                RETURNING id
            """, (title, fname, godina, url, url, sha1, text[:500000]))
            doc_id = cur.fetchone()['id']
            print(f" + INSERT id={doc_id}")
    return doc_id


def _replace_chunks(conn, doc_id, text, idx_col, content_col):
    """Delete the document's existing chunks and insert freshly cut ones.

    ``text`` is cut into 1000-character windows starting every 800 characters
    (so neighbours overlap by 200). Only the first 300 windows are considered,
    and windows with 50 or fewer non-blank characters are skipped. The stored
    index is the window's position in that sequence, so skipped windows leave
    gaps. The first failing INSERT is reported and stops the insertion.
    """
    with conn.cursor() as cur:
        cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
        chunks = [text[i:i+1000] for i in range(0, len(text), 800)]
        for i, ch in enumerate(chunks[:300]):
            if len(ch.strip()) > 50:
                try:
                    # Column names are interpolated because they are discovered
                    # at runtime from information_schema, not from user input.
                    cur.execute(f"""
                        INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col})
                        VALUES (%s, %s, %s)
                    """, (doc_id, i, ch))
                except Exception as e:
                    print(f" ERR chunk {i}: {e}")
                    break


def main():
    """Download, parse and store every yearbook listed in ``GODISNJAK_URLS``.

    For each ``(url, year)`` entry the PDF is downloaded (or taken from the
    local cache), hashed with SHA1, parsed to text, upserted into
    ``pgz_sport.dokumenti`` and re-chunked into ``pgz_sport.dokument_chunks``.
    Documents that fail to download or parse are skipped. The connection runs
    in autocommit mode, so every statement is committed on its own.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        # Discover the column names of the chunks table.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
            """)
            cols = [r[0] for r in cur.fetchall()]

        print(f"dokument_chunks columns: {cols}")

        # The column choice depends only on the schema, so decide it once
        # instead of once per document.
        chunk_columns = _pick_chunk_columns(cols)
        if chunk_columns is None:
            # Say so once, rather than skipping the chunk step without a trace.
            print(" ! no known index column in dokument_chunks; chunks are not written")

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname

            print(f"\n📄 {title}")
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            # SHA1 of the file content identifies an already stored document.
            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()

            text, pages = parse_pdf(downloaded)
            if not text:
                print(f" ✗ parse failed")
                continue
            print(f" ✓ {pages} pages, {len(text)} chars")

            # UPSERT into dokumenti.
            doc_id = _upsert_dokument(conn, title, fname, godina, url, sha1, text)

            # Chunks are written only when a usable index column was found.
            if chunk_columns is not None:
                idx_col, content_col = chunk_columns
                _replace_chunks(conn, doc_id, text, idx_col, content_col)

            parsed_count += 1

        print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
    finally:
        # Always release the database connection, even if a document fails.
        conn.close()
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|