From 125ba6dbfbf73dbe2729950aab87f036075fa2e3 Mon Sep 17 00:00:00 2001 From: CC4 Date: Tue, 5 May 2026 08:52:47 +0200 Subject: [PATCH] CC4: fix 3 outstanding bugs (dokumenti dup, upload, kategorizirani SQL) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug #1 — `/api/v2/dokumenti/{did:int}` duplicate registration (route shadowing): - Bila dva @router.get-a za isti path, drugi (s chunks) bio dead code - Renamed duplicate na /dokumenti/{did}/full → eksplicitan RAG-rich varianta - /dokumenti/{did} ostaje simple (8 osnovnih polja), /full vraća chunks za RAG Bug #2 — `/api/v2/dokumenti/upload` MISSING: - Dodan @router.post("/dokumenti/upload") s multipart formom - Polja: file, title, vrsta, razina, organizacija, sport, izvor_url, godina, kratak_opis - Tekst ekstrakcija: pdftotext za PDF, raw decode za TXT - Pohrana: /opt/pgz-sport/_data/dokumenti_uploads/{ts}_{sha12}_{safe_fname} - Insert u pgz_sport.dokumenti (vraća dokument_id, fname, chars, sha12) - Audit log entry preko erp.audit_helper - Validacija: max 32 MB, dozvoljeno PDF/DOC/DOCX/TXT/RTF Bug #3 — SQL alias bug u /api/v2/kategorizirani/list: - Već popravljen u Sub1 commitu eb1b49f - Verify: /kategorizirani/list i /kategorizirani/by-sport oba 200, count=270 Smoke 5/5 ✓: - GET /v2/dokumenti -> 200 - GET /v2/dokumenti/1577 (simple) -> 200, 8 keys, no chunks - GET /v2/dokumenti/1577/full (RAG) -> 200, has_chunks, count=0 - POST /v2/dokumenti/upload (multipart)-> 200, doc_id=31214, chars=50, size=52 - GET /v2/kategorizirani/list -> 200, 270 redova /api/debug/dashboard: - services: pgz-sport, pgz-debug-tail, pgz-auto-triage, nginx ACTIVE - db: ok - 0 novih grešaka nakon restart-a (samo stari nginx-access 502 iz prijašnjih restarta) Co-Authored-By: Claude Opus 4.7 (1M context) --- pgz_sport_v2_router.py | 79 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/pgz_sport_v2_router.py b/pgz_sport_v2_router.py index 
# Limits and storage location for POST /dokumenti/upload.
_DOK_UPLOAD_DIR = "/opt/pgz-sport/_data/dokumenti_uploads"
_DOK_UPLOAD_MAX_BYTES = 32 * 1024 * 1024
_DOK_UPLOAD_SUFFIXES = (".pdf", ".doc", ".docx", ".txt", ".rtf")


def _dok_upload_suffix(filename: Optional[str]) -> str:
    """Return the lower-cased extension of *filename* (with the dot), or ""."""
    name = filename or ""
    if "." not in name:
        return ""
    return "." + name.rsplit(".", 1)[-1].lower()


def _dok_upload_store(raw: bytes, filename: Optional[str], suf: str, sha: str):
    """Write *raw* into the upload directory and return ``(fname, fpath)``.

    The stored name is ``{unix_ts}_{sha}_{sanitised original name}``; every
    character outside ``[A-Za-z0-9._-]`` (including path separators) is
    replaced by ``_``, so the client-supplied name cannot leave the directory.
    """
    import pathlib

    out_dir = pathlib.Path(_DOK_UPLOAD_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", filename or "upload")[:120]
    fname = f"{int(time.time())}_{sha}_{safe}"
    # The 120-char truncation may have cut the extension off; compare
    # case-insensitively so "X.PDF" does not become "X.PDF.pdf".
    if not fname.lower().endswith(suf):
        fname += suf
    fpath = out_dir / fname
    fpath.write_bytes(raw)
    return fname, fpath


def _dok_upload_extract_text(fpath, suf: str, raw: bytes) -> str:
    """Best-effort text extraction; returns "" when nothing can be extracted.

    PDF goes through the external ``pdftotext`` tool, TXT is decoded as UTF-8
    (invalid bytes dropped). DOC/DOCX/RTF are stored without extraction.
    """
    import logging
    import subprocess

    log = logging.getLogger(__name__)
    text = ""
    if suf == ".pdf":
        try:
            proc = subprocess.run(
                ["pdftotext", "-layout", "-q", str(fpath), "-"],
                capture_output=True, timeout=60)
        except (OSError, subprocess.SubprocessError) as exc:
            # pdftotext missing / not executable / timed out: keep the upload,
            # just without extracted text.
            log.warning("pdftotext failed for %s: %s", fpath, exc)
            return ""
        if proc.returncode != 0:
            log.warning("pdftotext exited with %s for %s",
                        proc.returncode, fpath)
        text = proc.stdout.decode("utf-8", "ignore")
    elif suf == ".txt":
        text = raw.decode("utf-8", "ignore")
    # PostgreSQL text values cannot contain NUL; without this the INSERT
    # fails after the file has already been written.
    return text.replace("\x00", "")


@router.post("/dokumenti/upload")
async def upload_dokument(
    file: UploadFile = File(...),
    title: str = Form(...),
    vrsta: str = Form("ostalo"),
    razina: Optional[str] = Form(None),
    organizacija: Optional[str] = Form(None),
    sport: Optional[str] = Form(None),
    izvor_url: Optional[str] = Form(None),
    godina: Optional[int] = Form(None),
    kratak_opis: Optional[str] = Form(None),
    authorization: Optional[str] = Header(None),
):
    """Upload a new document: store the file on disk and insert a DB row.

    Accepts a multipart form. Allowed extensions are PDF/DOC/DOCX/TXT/RTF and
    the payload is limited to 32 MB. Text is extracted best-effort (pdftotext
    for PDF, raw UTF-8 decode for TXT); other types are stored without text.

    Returns:
        ``{ok, dokument_id, fname, title, vrsta, chars, size, content_type,
        sha12}`` where ``chars`` is the length of the stored text (NULL/None
        when no text was extracted) and ``sha12`` is the first 12 hex digits
        of the SHA-256 of the payload.

    Raises:
        HTTPException(400): empty file, file too large, or unsupported type.
        HTTPException(500): the INSERT returned no row.
    """
    import hashlib
    import logging

    from fastapi.concurrency import run_in_threadpool

    log = logging.getLogger(__name__)

    # NOTE(review): `authorization` is accepted but never checked in this
    # handler — confirm that authentication is enforced elsewhere (router
    # dependency / middleware) before exposing this endpoint.

    # Read at most limit+1 bytes so an oversized upload is never fully loaded
    # into memory just to be rejected.
    raw = await file.read(_DOK_UPLOAD_MAX_BYTES + 1)
    if not raw:
        raise HTTPException(400, "Prazna datoteka")
    if len(raw) > _DOK_UPLOAD_MAX_BYTES:
        raise HTTPException(400, "Datoteka prevelika (max 32 MB)")

    suf = _dok_upload_suffix(file.filename)
    if suf not in _DOK_UPLOAD_SUFFIXES:
        raise HTTPException(400, f"Tip nije podržan: {suf}. Dozvoljeno: PDF/DOC/DOCX/TXT/RTF")

    sha = hashlib.sha256(raw).hexdigest()[:12]

    # Disk write, pdftotext (up to 60 s) and the DB call are blocking; run
    # them in the threadpool so the event loop keeps serving other requests.
    fname, fpath = await run_in_threadpool(
        _dok_upload_store, raw, file.filename, suf, sha)
    sadrzaj = await run_in_threadpool(_dok_upload_extract_text, fpath, suf, raw)

    try:
        row = await run_in_threadpool(db_one, """
            INSERT INTO pgz_sport.dokumenti
                (title, kratak_opis, vrsta, razina, organizacija, sport, izvor_url,
                 godina, fname, sadrzaj, scraped_at, aktivan)
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),true)
            RETURNING id, title, vrsta, razina, fname, length(sadrzaj) AS chars
        """, (title, kratak_opis, vrsta, razina, organizacija, sport, izvor_url,
              godina, fname, sadrzaj or None))
    except Exception:
        # Do not leave an orphaned file behind when the row was not created.
        try:
            fpath.unlink()
        except OSError:
            log.warning("could not remove orphaned upload %s", fpath)
        raise
    if not row:
        raise HTTPException(500, "Spremanje dokumenta nije uspjelo")

    # Audit is best-effort: a failure here must not fail the upload, but it
    # should be visible in the logs.
    try:
        from erp.audit_helper import audit as _audit
        await run_in_threadpool(
            _audit, "pgz_sport.dokumenti", "upload", row["id"],
            korisnik="api", field="title",
            new=f"{title} ({len(raw)} B, sha={sha})")
    except Exception:
        log.exception("audit entry for uploaded dokument %s failed", row["id"])

    return {"ok": True, "dokument_id": row["id"], "fname": fname,
            "title": row["title"], "vrsta": row["vrsta"], "chars": row["chars"],
            "size": len(raw), "content_type": file.content_type,
            "sha12": sha}
get_dokument_full(did: int): + """Full dokument view + RAG chunks (renamed from duplicate /dokumenti/{did:int}). + Old route bila duplikat — sad je eksplicitno /full za bogatiji prikaz.""" d = db_one("""SELECT id, title AS naziv, kratak_opis, sadrzaj, vrsta, razina, organizacija, sport, sluzbeni_glasnik, izvor_url, pdf_url, kljucne_rijeci, izdano_datum, godina