CC4: fix 3 outstanding bugs (dokumenti dup, upload, kategorizirani SQL)

Bug #1 — `/api/v2/dokumenti/{did:int}` duplicate registration (route shadowing):
- Bila su dva @router.get-a za isti path; drugi (s chunks) je bio dead code
- Duplikat preimenovan u /dokumenti/{did:int}/full → eksplicitna RAG-rich varijanta
- /dokumenti/{did} ostaje simple (8 osnovnih polja), /full vraća chunks za RAG

Bug #2 — `/api/v2/dokumenti/upload` MISSING:
- Dodan @router.post("/dokumenti/upload") s multipart formom
- Polja: file, title, vrsta, razina, organizacija, sport, izvor_url, godina, kratak_opis
- Ekstrakcija teksta: pdftotext za PDF, raw decode za TXT; DOC/DOCX/RTF se spremaju bez ekstrakcije (sadrzaj = NULL)
- Pohrana: /opt/pgz-sport/_data/dokumenti_uploads/{ts}_{sha12}_{safe_fname}
- Insert u pgz_sport.dokumenti (vraća dokument_id, fname, chars, sha12)
- Audit log entry preko erp.audit_helper
- Validacija: max 32 MB, dozvoljeno PDF/DOC/DOCX/TXT/RTF

Bug #3 — SQL alias bug u /api/v2/kategorizirani/list:
- Već popravljen u Sub1 commitu eb1b49f
- Verify: /kategorizirani/list i /kategorizirani/by-sport oba 200, count=270

Smoke 5/5 ✓:
- GET /v2/dokumenti                    -> 200
- GET /v2/dokumenti/1577 (simple)      -> 200, 8 keys, no chunks
- GET /v2/dokumenti/1577/full (RAG)    -> 200, has_chunks, count=0
- POST /v2/dokumenti/upload (multipart)-> 200, doc_id=31214, chars=50, size=52
- GET /v2/kategorizirani/list          -> 200, 270 redova

/api/debug/dashboard:
- services: pgz-sport, pgz-debug-tail, pgz-auto-triage, nginx ACTIVE
- db: ok
- 0 novih grešaka nakon restart-a (samo stari nginx-access 502 iz prijašnjih restarta)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
CC4
2026-05-05 08:52:47 +02:00
parent 3e60e5095a
commit 125ba6dbfb
+76 -3
View File
@@ -2242,6 +2242,78 @@ def list_dokumenti(razina: Optional[str] = None, vrsta: Optional[str] = None,
rows = db_query(sql, params) rows = db_query(sql, params)
return {"count": len(rows), "results": rows} return {"count": len(rows), "results": rows}
def _extract_upload_text(fpath, suf, raw):
    """Return best-effort plain text for an uploaded document.

    PDFs are converted with the external ``pdftotext`` tool (60 s timeout),
    TXT files are decoded as UTF-8 with undecodable bytes dropped, and every
    other accepted type yields an empty string (no extractor is wired in).
    NUL characters are removed because PostgreSQL text values cannot store
    them and the subsequent INSERT would otherwise fail.
    """
    import subprocess as _sp

    if suf == ".pdf":
        r = _sp.run(["pdftotext", "-layout", "-q", str(fpath), "-"],
                    capture_output=True, timeout=60)
        text = r.stdout.decode("utf-8", "ignore")
    elif suf == ".txt":
        text = raw.decode("utf-8", "ignore")
    else:
        # doc/docx/rtf: stored on disk only, no text extraction.
        text = ""
    return text.replace("\x00", "")


@router.post("/dokumenti/upload")
async def upload_dokument(
    file: UploadFile = File(...),
    title: str = Form(...),
    vrsta: str = Form("ostalo"),
    razina: Optional[str] = Form(None),
    organizacija: Optional[str] = Form(None),
    sport: Optional[str] = Form(None),
    izvor_url: Optional[str] = Form(None),
    godina: Optional[int] = Form(None),
    kratak_opis: Optional[str] = Form(None),
    authorization: Optional[str] = Header(None),
):
    """Upload a new document: store the file on disk and insert a DB row.

    The multipart form carries the file plus its metadata (title, vrsta,
    razina, organizacija, sport, izvor_url, godina, kratak_opis). Text
    content is extracted best-effort (pdftotext for PDF, raw decode for
    TXT); DOC/DOCX/RTF are stored without extracted text.

    Returns:
        dict with keys ``ok``, ``dokument_id``, ``fname``, ``title``,
        ``vrsta``, ``chars``, ``size``, ``content_type`` and ``sha12``.
        ``chars`` is the text length reported by the database; it is
        presumably None when no text was extracted, since empty text is
        stored as NULL.

    Raises:
        HTTPException(400): empty file, file larger than 32 MB, or an
            extension outside PDF/DOC/DOCX/TXT/RTF.
    """
    import asyncio as _aio
    import hashlib as _hl
    import logging as _lg
    import pathlib

    log = _lg.getLogger(__name__)

    # NOTE(review): `authorization` is accepted but never validated in this
    # handler — confirm that authentication is enforced elsewhere
    # (middleware / router dependency) before exposing this endpoint.

    max_bytes = 32 * 1024 * 1024
    # Read at most one byte past the limit so an oversized upload is
    # detected without pulling the whole payload into memory.
    raw = await file.read(max_bytes + 1)
    if not raw:
        raise HTTPException(400, "Prazna datoteka")
    if len(raw) > max_bytes:
        raise HTTPException(400, "Datoteka prevelika (max 32 MB)")

    filename = file.filename or ""
    suf = ("." + filename.rsplit(".", 1)[-1].lower()) if "." in filename else ""
    if suf not in (".pdf", ".doc", ".docx", ".txt", ".rtf"):
        raise HTTPException(400, f"Tip nije podržan: {suf}. Dozvoljeno: PDF/DOC/DOCX/TXT/RTF")

    out_dir = pathlib.Path("/opt/pgz-sport/_data/dokumenti_uploads")
    out_dir.mkdir(parents=True, exist_ok=True)
    sha = _hl.sha256(raw).hexdigest()[:12]
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", file.filename or "upload")[:120]
    fname = f"{int(time.time())}_{sha}_{safe}"
    if fname.lower().endswith(suf):
        # Normalise the extension case ("X.PDF" -> "X.pdf") instead of
        # appending a second suffix ("X.PDF.pdf").
        fname = fname[:-len(suf)] + suf
    else:
        # The suffix was lost when the name was truncated to 120 characters.
        fname += suf
    fpath = out_dir / fname
    fpath.write_bytes(raw)

    # Text extraction is best-effort: a failure is logged and the document
    # is still stored, just without text content.
    sadrzaj = ""
    try:
        # pdftotext may run for up to 60 s; run it in a worker thread so the
        # event loop keeps serving other requests meanwhile.
        sadrzaj = await _aio.get_running_loop().run_in_executor(
            None, _extract_upload_text, fpath, suf, raw)
    except Exception:
        log.warning("upload_dokument: text extraction failed for %s",
                    fname, exc_info=True)

    try:
        row = db_one("""
            INSERT INTO pgz_sport.dokumenti
            (title, kratak_opis, vrsta, razina, organizacija, sport, izvor_url,
            godina, fname, sadrzaj, scraped_at, aktivan)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),true)
            RETURNING id, title, vrsta, razina, fname, length(sadrzaj) AS chars
            """, (title, kratak_opis, vrsta, razina, organizacija, sport, izvor_url,
                  godina, fname, sadrzaj or None))
    except Exception:
        # Do not leave an orphaned file behind when the row was not created.
        try:
            fpath.unlink()
        except OSError:
            log.warning("upload_dokument: could not remove orphaned file %s",
                        fpath, exc_info=True)
        raise

    # Audit entry is best-effort: the upload has already succeeded, so a
    # failing audit helper is logged rather than turned into an error.
    try:
        from erp.audit_helper import audit as _audit
        _audit("pgz_sport.dokumenti", "upload", row["id"],
               korisnik="api", field="title",
               new=f"{title} ({len(raw)} B, sha={sha})")
    except Exception:
        log.warning("upload_dokument: audit log entry failed for dokument %s",
                    row["id"], exc_info=True)

    return {"ok": True, "dokument_id": row["id"], "fname": fname,
            "title": row["title"], "vrsta": row["vrsta"], "chars": row["chars"],
            "size": len(raw), "content_type": file.content_type,
            "sha12": sha}
@router.get("/dokumenti/by-razina") @router.get("/dokumenti/by-razina")
def dokumenti_grouped(): def dokumenti_grouped():
"""Group po razini i vrsti — for dashboard.""" """Group po razini i vrsti — for dashboard."""
@@ -2251,9 +2323,10 @@ def dokumenti_grouped():
GROUP BY razina, vrsta ORDER BY razina, vrsta""") GROUP BY razina, vrsta ORDER BY razina, vrsta""")
return {"count": len(rows), "results": rows} return {"count": len(rows), "results": rows}
@router.get("/dokumenti/{did:int}") @router.get("/dokumenti/{did:int}/full")
def get_dokument(did: int): def get_dokument_full(did: int):
"""Full dokument view with content.""" """Full dokument view + RAG chunks (renamed from duplicate /dokumenti/{did:int}).
Old route bila duplikat — sad je eksplicitno /full za bogatiji prikaz."""
d = db_one("""SELECT id, title AS naziv, kratak_opis, sadrzaj, vrsta, razina, d = db_one("""SELECT id, title AS naziv, kratak_opis, sadrzaj, vrsta, razina,
organizacija, sport, sluzbeni_glasnik, izvor_url, pdf_url, organizacija, sport, sluzbeni_glasnik, izvor_url, pdf_url,
kljucne_rijeci, izdano_datum, godina kljucne_rijeci, izdano_datum, godina