Files
pgz-sport/scripts/godisnjak_pipeline_v2.py

154 lines
7.0 KiB
Python
Executable File

#!/usr/bin/env python3
from dotenv import load_dotenv
load_dotenv('/opt/rinet-gpu/.env.master')
# auto-added by patch_scrapers_with_dotenv.sh
"""
Godišnjak pipeline v2 — popravljen za pravu shemu.
"""
import os, sys, hashlib, requests, re
from pathlib import Path
import psycopg2
from psycopg2.extras import RealDictCursor
import pypdf
# key=value connection string handed to psycopg2.connect() in main(). The
# password is read from the DB_PASSWORD environment variable, so importing this
# module raises KeyError when that variable is not set.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
# Local directory where the downloaded yearbook PDFs are stored. It is created
# (including missing parents) as a side effect of importing this module.
UPLOAD_DIR = Path('/opt/pgz-sport/uploads/godisnjaci')
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
# 18 yearbooks from the 2006-2024 range (discovered by scraping).
# Each entry is a (PDF URL, year) pair; the list contains no entry for 2016.
GODISNJAK_URLS = [
("https://sport-pgz.hr/upload/dokumenti/publikacije/godisnjak-2006-print.pdf", 2006),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2007.pdf", 2007),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2008.pdf", 2008),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2009.pdf", 2009),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2010.pdf", 2010),
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf", 2011),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2012.pdf", 2012),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2013.pdf", 2013),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2014.pdf", 2014),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2015.pdf", 2015),
("https://sport-pgz.hr/upload/dokumenti/publikacije/sportski-godisnjak-2017.pdf", 2017),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2018.pdf", 2018),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2019.pdf", 2019),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2020.pdf", 2020),
("https://sport-pgz.hr/upload/dokumenti/publikacije/Sportski-godisnjak-2021.pdf", 2021),
("https://sport-pgz.hr/upload/dokumenti/sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf", 2022),
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf", 2023),
("https://sport-pgz.hr/upload/dokumenti/publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf", 2024),
]
def download_pdf(url, dest, min_size=1000):
    """Download *url* to *dest* unless a plausible copy is already on disk.

    Args:
        url: HTTP(S) address of the PDF.
        dest: ``pathlib.Path`` of the target file.
        min_size: A cached file or a response body of this many bytes or
            fewer is treated as invalid (error page, truncated file).

    Returns:
        *dest* when the file is cached or was downloaded successfully,
        otherwise ``None``. Failures are reported on stdout, never raised.
    """
    if dest.exists():
        size = dest.stat().st_size
        if size > min_size:
            print(f" [cached] {dest.name} ({size//1024}KB)")
            return dest

    # Write to a sibling temp file and rename it into place, so an interrupted
    # write can never leave a truncated file that a later run accepts as cached.
    tmp = dest.with_name(dest.name + ".part")
    try:
        r = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=120,
            allow_redirects=True,
        )
        if r.status_code != 200:
            print(f" ✗ HTTP {r.status_code}")
            return None
        if len(r.content) <= min_size:
            print(f" ✗ body too small ({len(r.content)} bytes)")
            return None
        tmp.write_bytes(r.content)
        tmp.replace(dest)
        return dest
    except (requests.RequestException, OSError) as e:
        print(f" ERR: {e}")

    # Best-effort removal of a partial temp file; it may not exist at all.
    try:
        tmp.unlink()
    except OSError:
        pass
    return None
def parse_pdf(path):
    """Extract the text of every page of the PDF at *path*.

    Args:
        path: Location of the PDF file (anything accepted by ``str()``).

    Returns:
        A ``(text, page_count)`` tuple. Each successfully extracted page
        contributes its text followed by a newline; pages whose extraction
        fails are skipped. ``('', 0)`` is returned when the file cannot be
        opened or read at all.
    """
    try:
        reader = pypdf.PdfReader(str(path))
        parts = []
        for page_no, page in enumerate(reader.pages, start=1):
            try:
                parts.append((page.extract_text() or '') + '\n')
            except Exception as e:
                # Malformed pages can fail in many ways; skip the page but
                # let KeyboardInterrupt/SystemExit propagate.
                print(f" ERR page {page_no}: {e}")
        return ''.join(parts), len(reader.pages)
    except Exception as e:
        # Deliberate best-effort boundary: an unreadable file must not abort
        # the whole pipeline, but the reason is reported rather than hidden.
        print(f" ERR parse {path}: {e}")
        return '', 0
# Values shared by the INSERT and UPDATE paths so both write the same data.
_ORGANIZACIJA = 'Zajednica sportova Primorsko-goranske županije'
_IZVOR_URL = 'https://sport-pgz.hr'
_MAX_SADRZAJ_CHARS = 500000  # cap on the text stored in dokumenti.sadrzaj
_CHUNK_SIZE = 1000           # characters per chunk
_CHUNK_STEP = 800            # window step, i.e. 200 characters of overlap
_MAX_CHUNKS = 300            # at most this many windows are considered
_MIN_CHUNK_CHARS = 50        # chunks with less non-blank text are skipped


def _chunk_columns(conn):
    """Return the column names of pgz_sport.dokument_chunks."""
    with conn.cursor() as cur:
        cur.execute("""
            SELECT column_name FROM information_schema.columns
            WHERE table_schema='pgz_sport' AND table_name='dokument_chunks'
        """)
        return [row[0] for row in cur.fetchall()]


def _upsert_document(conn, title, fname, godina, url, sha1, text):
    """Insert or update the pgz_sport.dokumenti row for one PDF; return its id.

    NOTE: rows are matched by SHA-1 of the file only, so a PDF whose bytes
    changed upstream is inserted as a new row rather than updated.
    """
    sadrzaj = text[:_MAX_SADRZAJ_CHARS]
    with conn.cursor(cursor_factory=RealDictCursor) as cur:
        cur.execute("SELECT id FROM pgz_sport.dokumenti WHERE sha1 = %s LIMIT 1", (sha1,))
        existing = cur.fetchone()
        if existing:
            doc_id = existing['id']
            cur.execute("""
                UPDATE pgz_sport.dokumenti
                SET title = %s, godina = %s, vrsta = 'sportski-godisnjak',
                    url = %s, pdf_url = %s, sadrzaj = %s,
                    sluzbeni_glasnik = 'ZSPGZ', razina = 'zupanijska',
                    organizacija = %s,
                    izvor_url = %s, last_updated = now()
                WHERE id = %s
            """, (title, godina, url, url, sadrzaj, _ORGANIZACIJA, _IZVOR_URL, doc_id))
            print(f" ↻ UPDATE id={doc_id}")
            return doc_id

        cur.execute("""
            INSERT INTO pgz_sport.dokumenti
                (title, fname, vrsta, godina, url, pdf_url, sha1, sadrzaj,
                 sluzbeni_glasnik, razina, organizacija, izvor_url)
            VALUES (%s, %s, 'sportski-godisnjak', %s, %s, %s, %s, %s,
                    'ZSPGZ', 'zupanijska', %s, %s)
            RETURNING id
        """, (title, fname, godina, url, url, sha1, sadrzaj, _ORGANIZACIJA, _IZVOR_URL))
        doc_id = cur.fetchone()['id']
        print(f" + INSERT id={doc_id}")
        return doc_id


def _replace_chunks(conn, doc_id, text, cols):
    """Delete and re-create the dokument_chunks rows of document *doc_id*.

    Does nothing when the table has none of the known index columns.
    """
    idx_col = next((c for c in ('idx', 'chunk_idx', 'i', 'page') if c in cols), None)
    if idx_col is None:
        return
    content_col = next((c for c in ('content', 'chunk', 'text') if c in cols), 'sadrzaj')

    chunks = [text[i:i + _CHUNK_SIZE] for i in range(0, len(text), _CHUNK_STEP)]
    with conn.cursor() as cur:
        cur.execute("DELETE FROM pgz_sport.dokument_chunks WHERE dokument_id = %s", (doc_id,))
        for i, ch in enumerate(chunks[:_MAX_CHUNKS]):
            if len(ch.strip()) <= _MIN_CHUNK_CHARS:
                continue
            try:
                # Column names come from the fixed candidate lists above,
                # never from external input, so interpolating them is safe.
                cur.execute(f"""
                    INSERT INTO pgz_sport.dokument_chunks (dokument_id, {idx_col}, {content_col})
                    VALUES (%s, %s, %s)
                """, (doc_id, i, ch))
            except psycopg2.Error as e:
                # Stop at the first failure: later chunks would fail the same way.
                print(f" ERR chunk {i}: {e}")
                break


def main():
    """Download every yearbook PDF, extract its text and store it in the DB.

    For each entry of GODISNJAK_URLS the PDF is fetched (or taken from the
    local cache), parsed, upserted into pgz_sport.dokumenti and re-chunked
    into pgz_sport.dokument_chunks. Documents that fail to download or parse
    are skipped. Database errors outside the chunk inserts propagate.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        cols = _chunk_columns(conn)
        print(f"dokument_chunks columns: {cols}")

        parsed_count = 0
        for url, godina in GODISNJAK_URLS:
            title = f"Sportski godišnjak ZSPGZ {godina}"
            fname = f"sportski-godisnjak-{godina}.pdf"
            dest = UPLOAD_DIR / fname
            print(f"\n📄 {title}")

            downloaded = download_pdf(url, dest)
            if not downloaded:
                continue

            sha1 = hashlib.sha1(downloaded.read_bytes()).hexdigest()
            text, pages = parse_pdf(downloaded)
            # PostgreSQL text columns cannot store NUL characters and psycopg2
            # rejects such strings, so strip them before any database write.
            text = text.replace('\x00', '')
            if not text:
                print(f" ✗ parse failed")
                continue
            print(f"{pages} pages, {len(text)} chars")

            doc_id = _upsert_document(conn, title, fname, godina, url, sha1, text)
            _replace_chunks(conn, doc_id, text, cols)
            parsed_count += 1

        print(f"\n✅ Done. Parsed: {parsed_count}/{len(GODISNJAK_URLS)}")
    finally:
        # Release the server-side connection even when a document fails.
        conn.close()


if __name__ == '__main__':
    main()