Files
pgz-sport/scrapers/godisnjak_full_ingest.py
T

122 lines
5.7 KiB
Python
Executable File

#!/usr/bin/env python3
"""Re-ingest the ZS PGŽ sports yearbooks (godišnjaci), 2006-2024: full text extracted from the PDFs."""
import os, re, hashlib, subprocess, requests, psycopg2
from datetime import date
# Connection parameters handed to psycopg2.connect(**DB).
#
# The previous version passed ``port`` twice (6432 and 5432). Python rejects a
# repeated keyword argument at compile time ("SyntaxError: keyword argument
# repeated"), so the script could not even start. Only one value is kept.
# NOTE(review): 6432 (the first value, listed next to the host) is kept and
# 5432 dropped — confirm which port this host actually serves.
#
# SECURITY: the password is committed in source; it should be rotated and
# supplied from outside the repository (environment or a secrets store).
DB = dict(
    host="10.10.0.2",
    port=6432,
    dbname='rinet_v3',
    user='rinet',
    password='R1net2026!SecureDB#v7',
)
# User-Agent string sent with every HTTP request made by this script.
UA = "RiNET-Civic/1.0 (https://rinet.one)"

# Common URL prefix under which all yearbook PDFs are published.
BASE = "https://sport-pgz.hr/upload/dokumenti"

# (year, path relative to BASE) for each yearbook PDF.
# The file names do not follow one scheme, so each path is spelled out.
# There is no entry for 2016.
_YEARBOOK_PATHS = (
    ("2006", "publikacije/godisnjak-2006-print.pdf"),
    ("2007", "publikacije/Sportski-godisnjak-2007.pdf"),
    ("2008", "publikacije/Sportski-godisnjak-2008.pdf"),
    ("2009", "publikacije/Sportski-godisnjak-2009.pdf"),
    ("2010", "publikacije/Sportski-godisnjak-2010.pdf"),
    ("2011", "publikacije/sportski-godisnjak-2011_2025-10-07-125709_xcqb.pdf"),
    ("2012", "publikacije/Sportski-godisnjak-2012.pdf"),
    ("2013", "publikacije/Sportski-godisnjak-2013.pdf"),
    ("2014", "publikacije/Sportski-godisnjak-2014.pdf"),
    ("2015", "publikacije/Sportski-godisnjak-2015.pdf"),
    ("2017", "publikacije/sportski-godisnjak-2017.pdf"),
    ("2018", "publikacije/Sportski-godisnjak-2018.pdf"),
    ("2019", "publikacije/Sportski-godisnjak-2019.pdf"),
    ("2020", "publikacije/Sportski-godisnjak-2020.pdf"),
    ("2021", "publikacije/Sportski-godisnjak-2021.pdf"),
    ("2022", "sportski-godisnjaci/ZSPGZ-Sportski-godisnjak-2022.pdf"),
    ("2023", "publikacije/ZSPGZ-Sportski-godisnjak-2023-2024-06-07_web.pdf"),
    ("2024", "publikacije/ZSPGZ-Sportski-godisnjak-2024-2025-09-18_web.pdf"),
)

# List of (year, absolute PDF URL) pairs, in chronological order.
GODISNJACI = [(yr, "/".join((BASE, rel))) for yr, rel in _YEARBOOK_PATHS]
# Directory where downloaded PDFs and their extracted text files are kept;
# it is created at start-up if it does not exist yet.
OUT_DIR = "/opt/pgz-sport/_data/godisnjaci"
os.makedirs(OUT_DIR, exist_ok=True)

# One shared HTTP session so every request carries the project User-Agent.
s = requests.Session()
s.headers["User-Agent"] = UA

# Database connection in autocommit mode: each statement is committed on its
# own, so the script never calls commit() explicitly.
conn = psycopg2.connect(**DB)
conn.autocommit = True
cu = conn.cursor()
# Main ingest loop. For every (year, url) pair:
#   1. make sure the PDF is cached in OUT_DIR (download if missing/too small),
#   2. make sure the extracted text is cached (run pdftotext if missing/too small),
#   3. read the text and upsert it into pgz_sport.dokumenti.
# A failure in any step is reported and the loop moves on to the next year.
for year, url in GODISNJACI:
    pdf_path = f"{OUT_DIR}/godisnjak_{year}.pdf"
    txt_path = f"{OUT_DIR}/godisnjak_{year}.txt"

    # Download if missing, or if the cached file is smaller than 100 000 bytes.
    if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) < 100000:
        print(f" [{year}] downloading from {url}")
        try:
            r = s.get(url, timeout=120)
        except requests.RequestException as e:
            print(f" [{year}] download failed: {e}")
            continue
        if r.status_code != 200:
            print(f" [{year}] HTTP {r.status_code}, skip")
            continue
        # Write under a temporary name and rename afterwards, so an interrupted
        # write can never leave a truncated file under the final name that the
        # size check above would accept on the next run.
        tmp_path = f"{pdf_path}.part"
        try:
            with open(tmp_path, "wb") as f:
                f.write(r.content)
            os.replace(tmp_path, pdf_path)
        except OSError as e:
            print(f" [{year}] download failed: {e}")
            continue
        print(f" [{year}] downloaded {len(r.content)/1024/1024:.1f} MB")

    # Extract text via pdftotext if missing, or if the cached text is under 1000 bytes.
    if not os.path.exists(txt_path) or os.path.getsize(txt_path) < 1000:
        print(f" [{year}] extracting text…")
        try:
            subprocess.run(
                ["pdftotext", "-layout", "-enc", "UTF-8", pdf_path, txt_path],
                check=True, timeout=300, capture_output=True,
            )
        except (subprocess.SubprocessError, OSError) as e:
            # SubprocessError: non-zero exit or timeout; OSError: binary not found.
            print(f" [{year}] pdftotext failed: {e}")
            continue

    # Read text; undecodable bytes are replaced rather than raising.
    try:
        with open(txt_path, encoding='utf-8', errors='replace') as f:
            text = f.read()
    except OSError as e:
        print(f" [{year}] read failed: {e}")
        continue
    # PostgreSQL text values cannot contain NUL characters; drop any that the
    # extraction produced so the statements below do not fail on them.
    text = text.replace("\x00", "")
    if len(text) < 5000:
        print(f" [{year}] text too short ({len(text)} chars), skip")
        continue

    # First 40 hex digits of the SHA-256 of the text; stored in the `sha1` column.
    sha = hashlib.sha256(text.encode()).hexdigest()[:40]
    # Form-feed characters (chr(12)) are counted as page breaks.
    pages = text.count(chr(12)) + 1
    title = f"Sportski godišnjak Zajednice sportova PGŽ {year}"
    if year in ("2023", "2024"):
        title = f"Sportski godišnjak ZS PGŽ {year} (web)"
    izdano = date(int(year), 12, 31)
    # Values shared by the UPDATE and INSERT branches.
    opis = f"Godišnji bilten ZS PGŽ {year} - {pages} stranica, {len(text)//1000} k znakova"
    kljucne_rijeci = ['godišnjak', 'ZS PGŽ', year, 'PGŽ', 'klubovi', 'savezi', 'sportaši']

    # Update existing or insert new.
    # The title/fname patterns alone never match rows written by this script:
    # its titles contain "godišnjak" (with "š"), not "godisnjak", and it does
    # not set fname. Without the (vrsta, godina) condition every rerun would
    # insert a duplicate row instead of updating the previous one.
    cu.execute(
        """SELECT id FROM pgz_sport.dokumenti
           WHERE LOWER(title) LIKE %s
              OR fname LIKE %s
              OR (vrsta = %s AND godina = %s)
           ORDER BY id LIMIT 1""",
        (f"%godisnjak%{year}%", f"%godisnjak_{year}%", 'godisnjak', int(year)),
    )
    existing = cu.fetchone()
    if existing:
        cu.execute(
            """UPDATE pgz_sport.dokumenti SET
               title=%s, sadrzaj=%s, sha1=%s, godina=%s, izdano_datum=%s,
               url=%s, pdf_url=%s, izvor_url=%s, vrsta=%s, organizacija=%s, razina=%s,
               kratak_opis=%s, kljucne_rijeci=%s, scraped_at=now()
               WHERE id=%s""",
            (title, text, sha, int(year), izdano,
             url, url, url, 'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
             opis, kljucne_rijeci,
             existing[0]),
        )
        print(f" [{year}] ✓ UPDATED dok #{existing[0]}: {len(text)} chars, {pages} pages")
    else:
        cu.execute(
            """INSERT INTO pgz_sport.dokumenti
               (title, sadrzaj, sha1, godina, izdano_datum, url, pdf_url, izvor_url,
                vrsta, organizacija, razina, kratak_opis, kljucne_rijeci, aktivan)
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,true) RETURNING id""",
            (title, text, sha, int(year), izdano, url, url, url,
             'godisnjak', 'Zajednica sportova PGŽ', 'PGŽ',
             opis, kljucne_rijeci),
        )
        new_id = cu.fetchone()[0]
        print(f" [{year}] ✓ INSERTED dok #{new_id}: {len(text)} chars, {pages} pages")
# Final report: list every row with vrsta='godisnjak' (year, text length,
# PDF URL truncated to 80 characters), then close the connection.
try:
    cu.execute("""SELECT godina, length(sadrzaj), pdf_url FROM pgz_sport.dokumenti
                  WHERE vrsta='godisnjak' ORDER BY godina""")
    print("\n=== Godišnjaci u DB ===")
    for godina, n_chars, pdf_url in cu.fetchall():
        # length(sadrzaj) and pdf_url come back as None when the column is
        # NULL; fall back to 0 / '' so formatting and slicing cannot raise
        # TypeError on such rows.
        print(f" {godina}: {n_chars or 0:,} chars ({(pdf_url or '')[:80]})")
finally:
    # Close the connection even if the report query fails.
    conn.close()