Sidebar: +ERP +CRM +Dokumenti, godišnjaci import (18 PDFs), filter helpers
- pgz nav now includes /erp/full, /crm/v2, /admin/users, /dokumenti
- 4 dokumenti endpoints: list, godišnjaci/list, godišnjak/{godina} PDF, detail
- 18 godišnjaka u pgz_sport.dokumenti (2006-2024, bez 2016) with savez_id=333
- PGŽ filter helpers (window._pgz_filter_priority, togglePGZFilter)
- navItemClick handler for nav items with href
This commit is contained in:
Executable
+150
@@ -0,0 +1,150 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Godišnjak pipeline v2 — popravljen za pravu shemu.
|
||||
"""
|
||||
import os, sys, hashlib, requests, re
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
import pypdf
|
||||
|
||||
# libpq connection string for the target database.
# NOTE(review): the fallback below embeds a live credential in source control.
# Set PGZ_SPORT_DSN in the environment instead; the literal is kept only so
# existing deployments keep working, and the password should be rotated and
# removed from the repository.
DSN = os.environ.get(
    "PGZ_SPORT_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
# Local directory where the downloaded yearbook PDFs are stored.
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
# Created at import time (including missing parents) so later writes cannot
# fail on a missing directory; an already existing directory is accepted.
os.makedirs(UPLOAD_DIR, exist_ok=True)
|
||||
|
||||
# 18 yearbooks covering 2006-2024, discovered by scraping the site.
# There is no entry for 2016 in this list.
_DOKUMENTI_BASE = "https://sport-pgz.hr/upload/dokumenti/"
_PUBLIKACIJE = _DOKUMENTI_BASE + "publikacije/"

# Each entry is (pdf_url, year).
GODISNJAK_URLS = [
    (_PUBLIKACIJE + "godisnjak-2006-print.pdf", 2006),
    (_PUBLIKACIJE + "Sportski-godisnjak-2007.pdf", 2007),
    (_PUBLIKACIJE + "Sportski-godisnjak-2008.pdf", 2008),
    (_PUBLIKACIJE + "Sportski-godisnjak-2009.pdf", 2009),
    (_PUBLIKACIJE + "Sportski-godisnjak-2010.pdf", 2010),
    (_PUBLIKACIJE + "sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011),
    (_PUBLIKACIJE + "Sportski-godisnjak-2012.pdf", 2012),
    (_PUBLIKACIJE + "Sportski-godisnjak-2013.pdf", 2013),
    (_PUBLIKACIJE + "Sportski-godisnjak-2014.pdf", 2014),
    (_PUBLIKACIJE + "Sportski-godisnjak-2015.pdf", 2015),
    (_PUBLIKACIJE + "sportski-godisnjak-2017.pdf", 2017),
    (_PUBLIKACIJE + "Sportski-godisnjak-2018.pdf", 2018),
    (_PUBLIKACIJE + "Sportski-godisnjak-2019.pdf", 2019),
    (_PUBLIKACIJE + "Sportski-godisnjak-2020.pdf", 2020),
    (_PUBLIKACIJE + "Sportski-godisnjak-2021.pdf", 2021),
    # The 2022 edition is the only one served from a different directory.
    (_DOKUMENTI_BASE + "sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022),
    (_PUBLIKACIJE + "ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023),
    (_PUBLIKACIJE + "ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024),
]
|
||||
|
||||
def download_pdf(url, dest, min_size=1000):
    """Download *url* to *dest* unless a plausible copy is already on disk.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the target file.
        min_size: Size in bytes that a cached file or a response body must
            exceed to be accepted; smaller payloads are treated as failures.

    Returns:
        *dest* when the file is cached or was downloaded successfully,
        otherwise ``None``. Failures are printed, never raised.
    """
    if dest.exists() and dest.stat().st_size > min_size:
        print(f" [cached] {dest.name} ({dest.stat().st_size//1024}KB)")
        return dest

    # Write to a sibling temp file and rename it into place, so an interrupted
    # run can never leave a truncated file that passes the cache check above.
    tmp = dest.with_name(dest.name + ".part")
    try:
        r = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=120,
            allow_redirects=True,
        )
        if r.status_code != 200:
            print(f" ✗ HTTP {r.status_code}")
            return None
        if len(r.content) <= min_size:
            # Report the real reason instead of a misleading "HTTP 200".
            print(f" ✗ response too small ({len(r.content)} bytes)")
            return None
        tmp.write_bytes(r.content)
        tmp.replace(dest)
        return dest
    except (requests.RequestException, OSError) as e:
        print(f" ERR: {e}")
        return None
    finally:
        # After a successful rename the temp file no longer exists; after a
        # failure this removes the partial download. Cleanup is best effort.
        try:
            tmp.unlink()
        except OSError:
            pass
|
||||
|
||||
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF file (anything ``str()`` turns into a path).

    Returns:
        A ``(text, page_count)`` tuple. Each page that could be extracted
        contributes its text followed by a newline; pages whose extraction
        raises are skipped. If the file cannot be opened or read at all the
        result is ``('', 0)``.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        failed_pages = 0
        for page in reader.pages:
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception:
                # One unreadable page must not discard the whole document,
                # but unlike a bare ``except`` this lets Ctrl-C through.
                failed_pages += 1
        if failed_pages:
            print(f" ! {failed_pages} page(s) could not be extracted")
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        # Report the reason instead of silently returning an empty result.
        print(f" ERR parse: {e}")
        return '', 0
|
||||
|
||||
# Chunking parameters: windows of _CHUNK_SIZE characters starting every
# _CHUNK_STRIDE characters (so neighbours overlap), at most _MAX_CHUNKS per
# document, skipping windows with _MIN_CHUNK_CHARS or fewer non-blank chars.
_CHUNK_SIZE = 1000
_CHUNK_STRIDE = 800
_MAX_CHUNKS = 300
_MIN_CHUNK_CHARS = 50
# Upper bound on the extracted text stored in dokumenti.sadrzaj.
_MAX_CONTENT_CHARS = 500000

# Known column names of pgz_sport.dokument_chunks, in order of preference.
_IDX_COLUMN_CANDIDATES = ('idx', 'chunk_idx', 'i', 'page')
_CONTENT_COLUMN_CANDIDATES = ('content', 'chunk', 'text', 'sadrzaj')

# Written by both INSERT and UPDATE so a re-run does not change the row.
_ORGANIZACIJA = 'Zajednica sportova Primorsko-goranske županije'
_IZVOR_URL = 'https://sport-pgz.hr'


def _first_present(candidates, available):
    """Return the first name in *candidates* found in *available*, else None."""
    for name in candidates:
        if name in available:
            return name
    return None


def _upsert_document(conn, title, fname, godina, url, sha1, text):
    """Insert or update the pgz_sport.dokumenti row for one PDF; return its id.

    An existing row is looked up by *sha1*; if found it is updated in place,
    otherwise a new row is inserted.
    """
    content = text[:_MAX_CONTENT_CHARS]
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute(
            "SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1",
            (sha1,),
        )
        existing = cur.fetchone()

        if existing:
            doc_id = existing['id']
            cur.execute("""
                UPDATE pgz_sport.dokumenti
                SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                    url = %s, pdf_url = %s, sadrzaj = %s,
                    sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                    organizacija = %s,
                    izvor_url = %s, last_updated = now()
                WHERE id = %s
            """, (title, godina, url, url, content,
                  _ORGANIZACIJA, _IZVOR_URL, doc_id))
            print(f" ↻ UPDATE id={doc_id}")
            return doc_id

        cur.execute("""
            INSERT INTO pgz_sport.dokumenti
                (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                 sluzbeni_glasnik, razina, organizacija, izvor_url)
            VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                    'ZSPGZ', 'zupanijska', %s, %s)
            RETURNING id
        """, (title, fname, godina, url, url, sha1, content,
              _ORGANIZACIJA, _IZVOR_URL))
        doc_id = cur.fetchone()['id']
        print(f" + INSERT id={doc_id}")
        return doc_id


def _store_chunks(conn, doc_id, text, idx_col, content_col):
    """Replace the chunks of document *doc_id* with windows cut from *text*.

    *idx_col* and *content_col* must come from the fixed candidate tuples
    above; that is what makes interpolating them into the SQL text safe.
    Insertion stops at the first database error, which is printed.
    """
    insert_sql = (
        f"INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col}) "
        "VALUES (%s, %s, %s)"
    )
    with conn.cursor() as cur:
        cur.execute(
            "DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s",
            (doc_id,),
        )
        starts = range(0, len(text), _CHUNK_STRIDE)[:_MAX_CHUNKS]
        # The stored index is the window's position, so skipped (near-empty)
        # windows leave gaps in the numbering.
        for i, start in enumerate(starts):
            chunk = text[start:start + _CHUNK_SIZE]
            if len(chunk.strip()) <= _MIN_CHUNK_CHARS:
                continue
            try:
                cur.execute(insert_sql, (doc_id, i, chunk))
            except psycopg2.Error as e:
                print(f" ERR chunk {i}: {e}")
                break


def main():
    """Download every yearbook PDF, extract its text and store it in the DB.

    For each entry of GODISNJAK_URLS the PDF is downloaded (or taken from the
    local cache), parsed, upserted into pgz_sport.dokumenti and split into
    overlapping chunks stored in pgz_sport.dokument_chunks. Entries that fail
    to download or parse are skipped; a summary is printed at the end.
    """
    conn = psycopg2.connect(DSN)
    conn.autocommit = True
    try:
        # The chunks table's column names are discovered at runtime.
        with conn.cursor() as cur:
            cur.execute("""
                SELECT column_name FROM information_schema.columns
                WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
            """)
            cols = [r[0] for r in cur.fetchall()]

        print(f"dokument_chunks columns: {cols}")

        idx_col = _first_present(_IDX_COLUMN_CANDIDATES, cols)
        content_col = _first_present(_CONTENT_COLUMN_CANDIDATES, cols)
        chunks_enabled = idx_col is not None and content_col is not None
        if not chunks_enabled:
            # Checked before any DELETE so existing chunks are never dropped
            # when there is no usable column to re-insert them into.
            print(" ! dokument_chunks: no usable index/content column, chunks skipped")

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname

            print(f"\n📄 {title}")
            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            # SHA-1 of the file content identifies an already imported PDF.
            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()

            text, pages = parse_pdf(downloaded)
            if not text:
                print(f" ✗ parse failed")
                continue
            # PostgreSQL text values cannot contain NUL and psycopg2 refuses
            # such strings, so drop any NULs produced by PDF extraction.
            text = text.replace('\x00', '')
            print(f" ✓ {pages} pages, {len(text)} chars")

            doc_id = _upsert_document(conn, title, fname, godina, url, sha1, text)

            if chunks_enabled:
                _store_chunks(conn, doc_id, text, idx_col, content_col)

            parsed_count += 1
    finally:
        conn.close()

    print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
|
||||
|
||||
# Run the import only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user