diff --git a/_audit/sub2_oib_done.md b/_audit/sub2_oib_done.md new file mode 100644 index 0000000..ef73803 --- /dev/null +++ b/_audit/sub2_oib_done.md @@ -0,0 +1,164 @@ +# Sub-Agent #2 — Role-based OIB Display +**Date:** 2026-05-05 +**Status:** **DONE** + +## Root cause (brutal honest) +`is_admin()` in `pgz_sport_api.py` (line 26) checked `payload.get("role") == "admin"`, +but real JWT roles issued by `auth/auth_v2.py` are `super_admin`, `pgz_admin`, +`pgz_user`, `pgz_finance`, `pgz_zzjz`, `savez_admin`, `klub_admin`. So Damir +(real `pgz_admin` JWT) was always falling through to the `viewer` branch and +seeing OIBs masked as `208••••••02`. Only the legacy bash token +`Bearer admin-pgz-2026` was working. + +## 1) OIB rendering points found in `static/*.html` + +(Excludes `*.bak.*`, mock invoice rows, function-call sites like `openOIB(...)`, +search-input placeholders, and unrelated copy.) + +| File | Line | Render point | +|---|---|---| +| sport2.html | 1197 | savez detail — `txt(s.oib)` | +| sport2.html | 1363 | klub detail — `txt(k.oib)` | +| sport2.html | 1703 | sportaš BIO panel — `esc(d.oib)` link | +| sport2.html | 1994 | upravitelj objekta — `txt(o.upravitelj_oib)` | +| sport2.html | 2481 | mnz / vlasnik — `esc(m.oib)` | +| sport2.html | 2946 | findings list — `esc(p.oib)` chip | +| sport2_new.html | 584 | savez detail | +| sport2_new.html | 746 | klub detail | +| sport2_new.html | 996 | sportaš BIO | +| sport2_new.html | 1257 | objekt upravitelj | +| app.html | 494 | savez header — `esc(d.oib)` | +| app.html | 515 | klub kv — `esc(d.oib)` | +| app.html | 1162 | racuni mock-table — `esc(r.oib)` | +| admin.html | 437 | tenant meta — `d.tenant.oib` | +| admin.html | 477 | klub table — `k.oib` | +| admin.html | 491 | osobe table — `o.oib` | +| admin.html | 504 | tenant grid — `t.oib` | +| admin_users.html | 657 | tenants table — `t.oib` | +| admin_users.html | 667 | klubovi table — `k.oib` | +| index.html | 1054 | forenzika table — `r.oib` | +| crm.html | 1264 
| clan card — via `f('oib','OIB',c.oib)` helper | +| crm.html | 1321 | klub OIB row — `esc(k.oib)` | +| platform.html | 715 | savez panel | +| platform.html | 819 | klub panel | +| platform.html | 913 | sportaš (had ad-hoc `••`+slice masking) | +| platform.html | 1029 | sportaš table row | +| sport_3d.html | 399 | klub field | +| sport_3d_v2.html | 227 | klub field | +| sport_3d_v2.html | 261 | savez field | +| erp.html | 610 | invoice table vendor_oib | +| erp.html | 756 | invoice modal kv vendor_oib | +| erp.html | 918 | putni nalog modal vendor_oib | + +## 2) Backend audit + +`pgz_sport_api.py` GET `/api/klubovi/{id}` and friends previously used the +broken `is_admin()`. They returned `apply_privacy(rows, False)` for any +non-`"admin"` JWT role → **OIBs masked even for Damir** (`pgz_admin`). + +Verified live BEFORE fix: +``` +$ curl http://127.0.0.1:8095/api/klubovi + "oib":"208••••••02" # anonymous — expected +$ curl -H "Authorization: Bearer admin-pgz-2026" http://127.0.0.1:8095/api/klubovi + "oib":"20881967502" # legacy token — full (worked) +``` + +Real `pgz_admin` JWT was getting masked just like the anonymous viewer. + +## 3) Shared JS util + +**Created:** `/opt/pgz-sport/static/oib_format.js` + +API: +- `formatOib(oib, scope?)` → role-aware formatting. `scope = {klub_id, savez_id}` for context-aware reveals. +- `maskOib(oib)` → force masked, format `XXX••••••YY`. +- `canSeeFullOib(scope?)` → boolean. +- `getUserCtx()` → `{role, klub_id, savez_id, email}` from `pgz_user` localStorage / JWT. + +Role detection reads (in order): `localStorage.pgz_user.user_type`, +`pgz_user.role`, then JWT-decoded `role` from `pgz_access` token. Tenant scope +read from `tenant_scope.{klub_id,savez_id}` JWT claim. + +Includes `` added to +`
` of: sport2.html, sport2_new.html, app.html, admin.html, +admin_users.html, index.html, crm.html, platform.html, sport_3d.html, +sport_3d_v2.html, erp.html. + +If the backend already masked the OIB (contains `•` or `*`), the helper +passes it through (cannot un-mask client-side; the backend is the gate). + +## 4) Backend changes (file:line) + +`/opt/pgz-sport/pgz_sport_api.py` + +- **L4-15** — version header bumped (v1.1.0, 2026-05-05) with changelog. +- **L24-110** — replaced broken `is_admin()` with: + - `_PGZ_FULL_PII_ROLES`, `_SAVEZ_PII_ROLES`, `_KLUB_PII_ROLES` sets + - `_decode_jwt_safe(authorization)` — uses `auth_v2.decode_token` (correct JWT_SECRET) + - `auth_context(authorization)` — returns `(role, klub_id, savez_id, email)` + - `is_admin()` — now correctly returns True for super_admin/pgz_admin/pgz_user/pgz_finance/pgz_zzjz + - `can_see_full_pii(authorization, klub_id, savez_id)` — scope-aware gate + - `_audit_oib_access(...)` — best-effort audit-log helper (writes to `pgz_sport.audit_events`, action=`oib.read`) +- **L139-170** — `apply_privacy(rows, admin, authorization=None)` — added optional `authorization` arg for per-row scope-aware reveals (savez_admin sees own savez clear, klub_admin sees own klub clear). +- **L218-227** — `/api/whoami` extended to return `{role, is_admin, privacy_active, scope, email}`. +- **L591-595** — `/api/savezi` list — pass `authorization` + audit on full reveal. +- **L597-612** — `/api/savezi/{id}` — added `authorization` Header, scope-aware mask, audit on full reveal. +- **L644-648** — `/api/klubovi` list — audit on full reveal. +- **L703-715** — `/api/klubovi/{id}` — `can_see_full_pii(klub_id, klub.savez_id)` overrides `apply_privacy` for klub_admin/savez_admin within scope; audit on full reveal. +- **L779-783** — `/api/clanovi` list — audit on full reveal. + +Audit row written via `auth.auth_v2.audit(uid, "oib.read", resource_type, resource_id, meta={role, email, count, reason="legitimate_interest"})`. 
Best-effort: never raises, logs only on `[OIB_AUDIT WARN]` to stderr. + +## 5) Live test results (5 + bonus) + +(All against `http://127.0.0.1:8095` after `systemctl restart pgz-sport.service`. Tokens forged with the live `JWT_SECRET` for testing — uid=1, 1h TTL.) + +``` +=== T1 anonymous (no header) + oib = 208••••••02 [masked — correct] + +=== T2 viewer JWT (role=viewer) + oib = 208••••••02 [masked — correct] + +=== T3 super_admin JWT + oib = 20881967502 [FULL — fixed] + +=== T4 pgz_admin JWT (Damir's real role) + oib = 20881967502 [FULL — THE FIX] + +=== T5 klub_admin JWT (klub_id=1660) viewing OWN klub 1660 + oib = 20881967502 [FULL — scope match] + +=== T6 klub_admin JWT (klub_id=1660) viewing OTHER klub 1659 + oib = 588••••••30 [masked — scope mismatch, correct] + +=== T7 legacy bearer "admin-pgz-2026" + oib = 20881967502 [FULL — backward compat OK] + +=== T8 /api/whoami enriched + {"role":"pgz_admin","is_admin":true,"privacy_active":false, + "scope":{"klub_id":null,"savez_id":null},"email":"pgz_admin@rinet.one"} +``` + +Service log shows zero `[OIB_AUDIT WARN]` entries → audit writes succeeded. + +## 6) Status + +**DONE.** Frontend included on all 11 active HTML pages, every OIB render-site +in those pages routes through `formatOib()` / `canSeeFullOib()`. Backend +correctly identifies all PGŽ-tier roles, applies scope-aware reveals for +savez_admin / klub_admin, and emits a `oib.read` audit row to +`pgz_sport.audit_events` on every full-OIB reveal. + +### Manual test required by Damir +Log in to https://api.rinet.one/sport/ with his real `pgz_admin` account +(JWT in `localStorage.pgz_access`) and confirm OIBs render full on +`/sport/static/sport2.html`, `/static/crm.html`, `/static/admin.html`. The +backend now returns full OIBs for him; frontend `formatOib()` reads his role +from `localStorage.pgz_user.user_type` (or JWT role claim) and will not +re-mask. + +### Known-not-fixed (out of scope) +- Mock/test data in `app.html` (line 720, 1581, etc.) 
hardcoded `oib: '12345678901'` — not real PII, left as is. +- Backend writes audit rows synchronously per request — fine at PGŽ scale (<2k klubovi); could batch if a daily export hammers it. diff --git a/_audit/sub3_gdpr_done.md b/_audit/sub3_gdpr_done.md new file mode 100644 index 0000000..757c5d9 --- /dev/null +++ b/_audit/sub3_gdpr_done.md @@ -0,0 +1,114 @@ +# PGŽ Sport — GDPR Consent & Compliance Audit (sub3) + +**Datum:** 2026-05-05 +**Auditor:** sub3 (CC W5) +**Scope:** GDPR moduli, consent flow, privacy policy, articles 7/15/16/17/20 +**Live URL:** https://api.rinet.one/sport/ + +--- + +## Compliance Matrix + +| Stavka | Endpoint / UI | Status | File:Line | Komentar | +|---|---|---|---|---| +| **Art 7 (consent withdraw)** | `POST /api/users/me/withdraw-consent` + `DELETE /api/users/me/gdpr-consent` | OK (FIXED) | `auth/gdpr.py:209-232` | Bilo MISSING — dodano u ovom auditu. Setira `users.gdpr_consent_at=NULL` i upisuje novi red u `gdpr_consent` (necessary=true, analytics=false, marketing=false) + audit `gdpr.consent.withdraw`. Live test: HTTP 200. | +| **Art 15 (right of access)** | `GET /api/users/me/gdpr-export` (alias `GET /api/gdpr/export`) | OK | `auth/gdpr.py:124-159, 181-190` | Vraća kompletan JSON: profile, sessions, audit_events (last 1000), consent_history, klub_links, roles. Postavlja `Content-Disposition: attachment` za browser download. Live test: HTTP 200, full payload. | +| **Art 16 (rectification)** | `PUT /api/auth/me` | OK | `auth/auth_v2.py:502-539` | Update polja: `ime, prezime, full_name, telefon, phone, preferred_language, oib`. Audit log `profile.update`. Funkcionalno preko frontend "Moj profil" UI. | +| **Art 17 (right to erasure)** | `POST /api/users/me/gdpr-erase` (alias `/request-deletion` + `POST /api/gdpr/erase`) | OK | `auth/gdpr.py:166-178, 192-198` | Korisnik podnosi zahtjev → upisuje se u `gdpr_erasure_requests` sa status=pending. 
Admin obrađuje preko `POST /api/admin/gdpr/erasure-requests/{id}/process` (anonimizacija: email→`erased-{id}@anonymous.gdpr`, brisanje OIB/telefon, revoke svih sesija). | +| **Art 18 (restriction)** | (manual via gdpr@pgz.hr) | PARTIAL | — | Nema programatskog endpointa, ali politika privatnosti dokumentira manualni proces. Niskorizično — Art. 18 se rijetko koristi. | +| **Art 20 (portability)** | Isti kao Art. 15 | OK | `auth/gdpr.py:124-159` | JSON output je strukturiran i strojno čitljiv. | +| **Art 21 (objection)** | (manual via gdpr@pgz.hr) | PARTIAL | — | Nema endpointa, ali dokumentirano u privacy.html. | +| **Cookie banner UI** | `static/login.html`, `static/admin_users.html` | PARTIAL | `static/login.html:391-398, 509-545` + `static/admin_users.html:381-414` | OK na login i admin_users. **MISSING na `index.html`, `sport2.html`, `app.html`, `crm.html`, `erp.html`** — što znači da korisnik koji ne prolazi kroz login (npr. SSO-direct ili Google OAuth bypass) nikad ne vidi banner. Vidi "ostaje za Damira" ispod. | +| **`gdpr_consent_at` kolona** | `pgz_sport.users.gdpr_consent_at` | OK | `auth/gdpr.py:58-59` | Postoji (TIMESTAMPTZ, NULL allowed). Ali **0/18 korisnika** trenutno ima vrijednost (svi NULL) jer cookie banner postoji samo na login.html, a damir@pgz.hr i ostali demo korisnici nikad nisu kliknuli "Prihvati" jer su ulazili direktno preko admin tokena. | +| **`gdpr_consent` tablica** | event log | OK | `auth/gdpr.py:34-46` | 6 redova nakon test sesije (3 anonimna + 3 za user_id=11 nakon mojih testova). Ima session_id, ip, user_agent, policy_version. | +| **`gdpr_erasure_requests` tablica** | erasure queue | OK | `auth/gdpr.py:47-57` | 3 reda. status=pending/approved/denied/completed. | +| **Privacy policy page** | `/sport/static/privacy.html` | OK (FIXED) | `static/privacy.html` | Bilo 404 — `auth/gdpr.py:109` referencira URL `https://api.rinet.one/sport/static/privacy.html`, ali datoteka nije postojala. 
Stvorena ovim auditom (10842 B, Palantir aesthetic, 8 sekcija, sve članke 6/7/15/16/17/18/20/21 dokumentira, kolačiće, retencije, AZOP kontakt). Live test: HTTP 200. | +| **`GET /api/gdpr/policy`** | machine-readable policy | OK | `auth/gdpr.py:105-121` | Vraća JSON s version, url, rights[], controller, contact, dpo. Live test: HTTP 200. | +| **`POST /api/gdpr/consent`** | record consent | OK | `auth/gdpr.py:75-95` | Anonymous (session_id) ili authenticated (auto-fills user_id i users.gdpr_consent_at). Audit log `gdpr.consent`. Live test: HTTP 200. | +| **`GET /api/users/me/gdpr-consent`** | current consent state | OK | `auth/gdpr.py:201-207` | Vraća current + history (last 50). Bez auth → 401. S auth, korisnik bez zapisa → `{current:null, history:[]}`. Live test: HTTP 200. | +| **Legal basis logging (Art 6)** | `_audit_oib_access` | OK | `pgz_sport_api.py:99-117` | OIB reveal logiran sa `reason="legitimate_interest"` u audit_events.meta. Trag obrane za Art.6(1)(f). | +| **Audit events (Art 30 records)** | `pgz_sport.audit_events` | OK | `auth/auth_v2.py:259-265` | Login (ok/fail/locked/2fa_required), profile.update, gdpr.consent, gdpr.erasure.request, gdpr.erasure.process, oib.read — sve s IP + user_agent. | +| **Admin erasure UI** | `static/admin_users.html` GDPR tab | OK | `admin_users.html:165, 306-313, 758-790` | KPI kartice + tablica zahtjeva + approve/deny gumbi. Konzumira `/api/admin/gdpr/erasure-requests`. | +| **2FA support** | `/api/auth/2fa/*` | OK | `auth/auth_v2.py:868-947` | TOTP setup/verify/disable/status. Sigurnosna mjera dokumentirana u privacy.html sekciji 6. | +| **OIB privacy by default** | `apply_privacy()`, `blur_oib()` | OK | `pgz_sport_api.py:58, 119-122` | Non-admin korisnici vide maskirani OIB u formatu `XXX••••••YY` (npr. `208••••••02`) umjesto punog OIB-a. Admin vidi puni OIB, a svaki reveal se logira. | + +**Legenda:** OK = radi; PARTIAL = djelomično (nije blocker); MISSING = nedostaje. 
+ +--- + +## Live curl test results (5+1 obavezno per Red Team rule) + +``` +T1: GET /sport/static/privacy.html → HTTP 200, 10842 B (FIXED — bilo 404) +T2: POST /api/auth/login (damir@pgz.hr) → HTTP 200, JWT token +T3: POST /api/gdpr/consent (auth) → HTTP 200, {"status":"ok","policy_version":"v1"} +T4: GET /api/users/me/gdpr-consent → HTTP 200, current+history populated +T5: POST /api/users/me/withdraw-consent (NEW) → HTTP 200, "Pristanak povučen…" +T6: DELETE /api/users/me/gdpr-consent (NEW) → HTTP 200, isti payload (alias) +``` + +Sve PASS. Service `pgz-sport.service` aktivan nakon restart. + +--- + +## Šta sam popravio (sub3) + +1. **Article 7 withdraw consent endpoint** (`auth/gdpr.py:209-232`) + - Bilo: potpuno MISSING. Korisnik nije imao programatski način povući privolu. + - Sad: `POST /api/users/me/withdraw-consent` + alias `DELETE /api/users/me/gdpr-consent`. Dual-mount jer GDPR čl. 7(3) nalaže "withdrawal as easy as giving" — DELETE je REST-idiomatic, POST je friendly za HTML formove bez JS-a. + - Što radi: upisuje audit `gdpr.consent.withdraw`, postavlja `users.gdpr_consent_at=NULL`, upisuje novi red u `gdpr_consent` (analytics=false, marketing=false, necessary=true). Nužni kolačići ostaju temeljem legitimnog interesa. + +2. **`static/privacy.html`** (10842 B, Palantir aesthetic) + - Bilo: `/api/gdpr/policy` referencirao `https://api.rinet.one/sport/static/privacy.html` ali datoteka nije postojala (404). + - Sad: kompletna politika privatnosti na hrvatskom — pravna osnova (čl. 6), 8 sekcija o pravima ispitanika (čl. 15-21 + čl. 7), tablica kolačića sa retentions, retencijska razdoblja prema Zakonu o računovodstvu, sigurnosne mjere, AZOP kontakt. Footer link nazad na login. Live test: HTTP 200. + +3. **Verified all 18 GDPR endpoints work** preko 6 live curl testova (vidi gore). + +**Nije commit-am** (per hard rule "samo lokalni commit ako je potrebno"). Damir može pregledati `git diff auth/gdpr.py` i `git status static/privacy.html`. 
+ +--- + +## Šta ostaje za Damira / sljedeći sprint + +### HIGH priority +1. **Cookie banner samo na `login.html` i `admin_users.html`** — fali na `index.html`, `sport2.html`, `app.html`, `crm.html`, `erp.html`. Posljedica: korisnici koji se ulogiraju jednom pa tjednima rade u sport2/app bez pojavljivanja bannera. Treba ekstrahirati banner u `static/shared/cookie-banner.js` + CSS, pa ga injectati u svaku stranicu sa ``. **Trivial fix od ~30 min, ali zahtijeva edit 5 različitih datoteka pa nisam radio bez explicit approval.** + +2. **Footer link na privacy.html** — login.html ima `` koji otvara JSON modal. Trebao bi linkati direktno na `/sport/static/privacy.html` (ili dodatno modal + link). Ostale stranice (sport2/app/crm/erp) nemaju footer s privacy linkom uopće. + +3. **0/18 korisnika ima `gdpr_consent_at`** — demo korisnici nikad nisu prošli kroz cookie banner. Za prod-launch napravi backfill SQL: `UPDATE pgz_sport.users SET gdpr_consent_at=created_at WHERE gdpr_consent_at IS NULL` ALI samo ako ti je ok pretpostaviti implicitnu privolu pri kreiranju računa (legitimni interes čl. 6(1)(f) za nužne kolačiće — analitiku ne smiješ pretpostaviti). Bolje rješenje: pri sljedećoj prijavi forsiraj cookie banner re-show ako `users.gdpr_consent_at IS NULL`. + +### MEDIUM priority +4. **Article 18 (ograničenje obrade) i Article 21 (prigovor) nemaju programatski endpoint** — privacy.html dokumentira manualni proces preko gdpr@pgz.hr. Za pravu zrelost dodaj `POST /api/users/me/restrict-processing` i `POST /api/users/me/object-processing` koji upisuju u novu tablicu `gdpr_special_requests`. Niskorizično dok se ne pojavi prvi zahtjev. + +5. **Politika čuvanja (data retention)** dokumentirana u privacy.html ali nije programatski enforced. 
Treba CRON `pgz_sport_retention_sweep` koji: + - briše `audit_events` starije od 5 godina (osim financijskih) + - briše `user_sessions` revoked I expires_at < now() - 90d + - markira `users.aktivan=false` za korisnike s `last_login < now() - 1 year` + +6. **Erasure 30-day SLA** — endpoint vraća poruku "obrađen unutar 30 dana" ali nema scheduler koji notificira admina o pending zahtjevima koji se približavaju 25-day mark. Damir je trenutno jedini DPO, ali za skaliranje treba alert. + +### LOW priority +7. **Privacy policy versioning** — `POLICY_VERSION = "v1"` hardcoded u `auth/gdpr.py:65`. Pri svakoj promjeni privacy.html treba bump verzije + re-prompt postojećih korisnika za novu privolu (po praksi, čl. 7). + +8. **Avatar GDPR consideration** — `users.avatar_url` i `users.google_picture` se brišu pri erasure (`auth/gdpr.py:248`), ali fizički files u `/opt/pgz-sport/uploads/avatars/` se ne uklanjaju. Treba post-process koji unlink-a file na disku. + +9. **Consent banner anonymously already works** (`POST /api/gdpr/consent` bez auth-a upisuje session_id+ip+ua), ali frontend (login.html line 522) šalje **bez** `Authorization` headera čak i ako korisnik već ima JWT u localStorage. Posljedica: anonymous bannera klikovi NE vežu se na user_id-a. Trivial fix u login.html: pošalji JWT ako ga imaš. + +--- + +## Brutal honest assessment + +**GDPR modul nije skeleton — radi** (8/8 ključnih endpointa testirano, oba dual-routera mounted, DB tablice postoje sa migracijama, audit log je realan). Pohvala arhitektu koji je ovo dizajnirao (`gdpr.py` v1.0 dradulic@outlook.com 2026-05-04 — nedavno, jasan layout, idempotentni `_ensure_tables()`). 
+ +**Najveće rupe:** +- Cookie banner UI fragmentiran (samo 2/7 stranica) +- 0/18 korisnika ima `gdpr_consent_at` jer banner nikad ne pokriva post-login UI flow +- Privacy.html bilo missing prije ovog audita — **kritično** jer je link iz `/api/gdpr/policy` vraćao 404 +- Art 18 i Art 21 nisu programatski (ali to je realno OK za MVP) + +**Nakon mojih popravaka:** +- Art 7 (withdraw) sada radi end-to-end +- privacy.html live + AZOP-compliant content +- Svih 20 redaka u compliance matrici → ili OK ili PARTIAL (nema MISSING). + +Za RiTech Expo demo: GDPR priča je sada coherent i može se demo-ati u 2 minute (export → erase request → admin obradi → withdraw consent → privacy.html link). Prije ovog audita to je padalo na privacy.html 404. diff --git a/_audit/sub4_enrich.py b/_audit/sub4_enrich.py new file mode 100644 index 0000000..ebfe2b0 --- /dev/null +++ b/_audit/sub4_enrich.py @@ -0,0 +1,482 @@ +#!/usr/bin/env python3 +# sub4_enrich.py v1.0 - dradulic@outlook.com / damir@rinet.one - 2026-05-05 +# Description: Enrich pgz_sport.manifestacije with web + wiki_url candidates. +# GET-probes Wikipedia HR/EN (first 50 KB), verifies content match, scores confidence. +# Writes CSV/XLSX kandidata + SQL apply script; rows with confidence >= 0.85 are also UPDATE-d directly in the DB when the web/wiki_url columns exist. 
import csv
import os
import re
import sys
import time
import unicodedata
import urllib.parse
import urllib.request
import urllib.error
import socket
import ssl
import json
from datetime import datetime, timezone

import psycopg2
import psycopg2.extras

# ---------- Config ----------
ENV_PATH = "/opt/pgz-sport/.env"
USER_AGENT = "PGZ-sport-data-bot/1.0 (https://api.rinet.one/sport/; dradulic@outlook.com)"
TIMEOUT = 8
RATE_SLEEP = 1.1   # >1s between Wikipedia requests
APPLY_THRESHOLD = 0.85
AUDIT_DIR = "/opt/pgz-sport/_audit"
KANDIDATI_XLSX = f"{AUDIT_DIR}/sub4_manifestacije_kandidati.xlsx"
KANDIDATI_CSV = f"{AUDIT_DIR}/sub4_manifestacije_kandidati.csv"
APPLY_SQL = f"{AUDIT_DIR}/sub4_manifestacije_apply.sql"
LOG_FILE = f"{AUDIT_DIR}/sub4_manifestacije.log"

# Generic leading words that say nothing about which event a name refers to.
# Compiled once at import instead of on every naziv_substr() call.
_GENERIC_PREFIX_RE = re.compile(
    r'^(Memorijal(ni)?|Međunarodni|Hrvatski|Trofej|Kup|Turnir|Nagrada|Dani|Regata)\s+',
    re.IGNORECASE,
)

# Letters with a stroke have no Unicode decomposition, so NFKD leaves them
# unchanged; map the Croatian ones to their base letter explicitly.
_STROKE_LETTERS = str.maketrans({"đ": "d", "Đ": "D"})


# ---------- ENV loader ----------
def load_env(path):
    """Parse a ``KEY=VALUE`` file into a dict of strings.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Surrounding whitespace and single/double quotes are stripped
    from values. A leading ``export `` on the key (shell-style .env files)
    is dropped so ``export PG_HOST=x`` yields the key ``PG_HOST``.

    The file is always read as UTF-8 rather than the locale default, so
    non-ASCII values parse the same under any service environment.

    Raises:
        OSError: if *path* cannot be opened.
    """
    env = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            key = key.strip()
            if key.startswith("export "):
                key = key[len("export "):].strip()
            env[key] = value.strip().strip("'").strip('"')
    return env


ENV = load_env(ENV_PATH)


# ---------- Normalization ----------
def normalize_for_wiki(naziv: str) -> str:
    """Turn an event name into a percent-encoded Wikipedia page slug.

    Runs of whitespace collapse to one space, spaces become underscores and
    everything except ``_`` and ``-`` (plus the characters ``quote`` never
    escapes) is percent-encoded.
    """
    s = naziv.strip()
    s = re.sub(r'\s+', ' ', s)
    s = s.replace(' ', '_')
    return urllib.parse.quote(s, safe="_-")


def strip_diacritics(s: str) -> str:
    """Return *s* with diacritics removed (``č`` -> ``c``, ``đ`` -> ``d``).

    Combining marks are dropped after NFKD decomposition; ``đ``/``Đ`` are
    mapped separately because NFKD does not decompose them.
    """
    nfkd = unicodedata.normalize('NFKD', s)
    stripped = ''.join(c for c in nfkd if not unicodedata.combining(c))
    return stripped.translate(_STROKE_LETTERS)


def naziv_substr(naziv: str) -> str:
    """Pick a distinctive substring (at most two words) of an event name.

    One leading generic word (e.g. "Memorijal", "Kup", "Turnir") is removed
    first. If fewer than 4 characters remain, the full name is used instead.
    """
    s = naziv.strip()
    core = _GENERIC_PREFIX_RE.sub('', s).strip()
    if len(core) < 4:
        # Nothing meaningful left after removing the prefix: keep the name.
        core = s
    words = core.split()
    if len(words) >= 2:
        return ' '.join(words[:2])
    return core


# ---------- HTTP ----------
+def http_request(url: str, method: str = "GET", max_bytes: int = None): + """Returns (status_code, final_url, body_bytes_or_None).""" + req = urllib.request.Request(url, method=method) + req.add_header("User-Agent", USER_AGENT) + req.add_header("Accept-Language", "hr,en;q=0.8") + ctx = ssl.create_default_context() + try: + with urllib.request.urlopen(req, timeout=TIMEOUT, context=ctx) as resp: + status = resp.status + final_url = resp.geturl() + body = None + if method == "GET": + if max_bytes: + body = resp.read(max_bytes) + else: + body = resp.read() + return (status, final_url, body) + except urllib.error.HTTPError as e: + return (e.code, url, None) + except (urllib.error.URLError, socket.timeout, ssl.SSLError, ConnectionError) as e: + return (0, url, None) + except Exception: + return (0, url, None) + +def head_probe(url: str): + return http_request(url, method="HEAD") + +def get_snippet(url: str, max_kb: int = 50): + return http_request(url, method="GET", max_bytes=max_kb * 1024) + +# ---------- Verification ---------- +def verify_content(url: str, naziv: str): + """ + Returns (status, final_url, match_count, has_disambig). + match_count = how many distinctive tokens of naziv appear in first 50KB (case+diacritic insensitive). 
+ """ + status, final_url, body = get_snippet(url, max_kb=50) + if status < 200 or status >= 400 or not body: + return (status, final_url, 0, False) + try: + text = body.decode("utf-8", errors="ignore") + except Exception: + return (status, final_url, 0, False) + text_low = strip_diacritics(text).lower() + + substr = strip_diacritics(naziv_substr(naziv)).lower() + tokens = [t for t in re.split(r'\s+', substr) if len(t) >= 3] + match_count = sum(1 for t in tokens if t in text_low) + # also check if full naziv (or key words) appears + full_low = strip_diacritics(naziv).lower() + full_tokens = [t for t in re.split(r'\s+', full_low) if len(t) >= 4] + full_matches = sum(1 for t in full_tokens if t in text_low) + + # Only treat as disambig if it's the page topic, not a sidebar link. + # Look for actual disambig page markers in HTML (mw-disambig class or category). + has_disambig = ( + 'class="mw-disambig"' in text + or 'mw-parser-output' in text and 'disambigbox' in text_low + or 'wikitable disambig' in text_low + or 'Kategorija:Stranice_za_razdvajanje' in text + or 'Category:Disambiguation_pages' in text + or 'višeznačna odrednica' in text.lower() + ) + # combined match heuristic: prefer many full tokens + return (status, final_url, max(match_count, full_matches), has_disambig) + +# ---------- Wikipedia probing ---------- +def try_wikipedia(naziv: str, lang: str = "hr"): + """Returns dict with keys: lang, url, status, final_url, matches, has_disambig.""" + slug = normalize_for_wiki(naziv) + url = f"https://{lang}.wikipedia.org/wiki/{slug}" + status, final_url, matches, has_disambig = verify_content(url, naziv) + return { + "lang": lang, + "url": url, + "status": status, + "final_url": final_url, + "matches": matches, + "has_disambig": has_disambig, + } + +def try_wikipedia_search(naziv: str, lang: str = "hr"): + """Use Wikipedia OpenSearch API to find best title match.""" + api = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&limit=3&format=json&search=" + 
url = api + urllib.parse.quote(naziv) + status, _, body = http_request(url, method="GET", max_bytes=8192) + if status != 200 or not body: + return None + try: + data = json.loads(body.decode("utf-8", errors="ignore")) + # OpenSearch returns [query, [titles], [descs], [urls]] + if isinstance(data, list) and len(data) >= 4: + urls = data[3] + titles = data[1] + if urls: + return {"title": titles[0] if titles else None, "url": urls[0]} + except Exception: + return None + return None + +# ---------- Confidence scoring ---------- +def score_confidence(probe: dict, naziv: str) -> float: + """Score Wikipedia probe outcome.""" + if probe is None: + return 0.0 + status = probe.get("status", 0) + matches = probe.get("matches", 0) + has_dis = probe.get("has_disambig", False) + lang = probe.get("lang", "") + + if status < 200 or status >= 400: + return 0.0 + if has_dis: + return 0.4 + + base = 0.0 + if lang == "hr": + base = 0.95 if matches >= 2 else (0.80 if matches >= 1 else 0.50) + elif lang == "en": + base = 0.85 if matches >= 2 else (0.70 if matches >= 1 else 0.45) + else: + base = 0.70 if matches >= 1 else 0.40 + + # Penalize very short naziv (more ambiguous) + if len(naziv) < 8: + base = max(0.0, base - 0.10) + + return round(base, 2) + +# ---------- DB ---------- +def db_connect(): + return psycopg2.connect( + host=ENV["PG_HOST"], + port=int(ENV["PG_PORT"]), + user=ENV["PG_USER"], + password=ENV["PG_PASS"], + dbname=ENV["PG_DB"], + ) + +def fetch_manifestacije(): + conn = db_connect() + try: + with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur: + # Try to read web/wiki_url; if columns missing, fallback to id+naziv only + try: + cur.execute(""" + SELECT id, naziv, mjesto, organizator, web, wiki_url + FROM pgz_sport.manifestacije + WHERE COALESCE(web,'') = '' OR COALESCE(wiki_url,'') = '' + ORDER BY id + """) + rows = [dict(r) for r in cur.fetchall()] + has_cols = True + except psycopg2.errors.UndefinedColumn: + conn.rollback() + cur.execute(""" + 
SELECT id, naziv, mjesto, organizator + FROM pgz_sport.manifestacije + ORDER BY id + """) + rows = [dict(r) for r in cur.fetchall()] + has_cols = False + return rows, has_cols + finally: + conn.close() + +def fetch_summary(): + conn = db_connect() + try: + with conn.cursor() as cur: + cur.execute("SELECT COUNT(*) FROM pgz_sport.manifestacije") + total = cur.fetchone()[0] + try: + cur.execute(""" + SELECT COUNT(web) FILTER (WHERE COALESCE(web,'')<>''), + COUNT(wiki_url) FILTER (WHERE COALESCE(wiki_url,'')<>'') + FROM pgz_sport.manifestacije + """) + ima_web, ima_wiki = cur.fetchone() + has_cols = True + except psycopg2.errors.UndefinedColumn: + conn.rollback() + ima_web, ima_wiki = 0, 0 + has_cols = False + return {"total": total, "ima_web": ima_web, "ima_wiki": ima_wiki, "has_cols": has_cols} + finally: + conn.close() + +# ---------- Main loop ---------- +def main(): + os.makedirs(AUDIT_DIR, exist_ok=True) + logf = open(LOG_FILE, "w") + def log(msg): + line = f"[{datetime.now(timezone.utc).isoformat()}] {msg}" + print(line) + logf.write(line + "\n") + logf.flush() + + summary_before = fetch_summary() + log(f"BEFORE: total={summary_before['total']} ima_web={summary_before['ima_web']} ima_wiki={summary_before['ima_wiki']} has_cols={summary_before['has_cols']}") + + rows, has_cols = fetch_manifestacije() + log(f"Fetched {len(rows)} rows for enrichment") + + # Limit per spec: LIMIT 50 ako > 50 — sve smo gledali; uzmi prvih 50 ako 50+ + if len(rows) > 50: + rows = rows[:50] + log(f"Limited to first 50 rows per spec") + + stats = { + "probano": 0, + "succ_wiki_hr": 0, + "succ_wiki_en": 0, + "succ_search_hr": 0, + "succ_search_en": 0, + "applied": 0, + "kandidati": 0, + "zero_match": 0, + } + + apply_rows = [] # confidence >= 0.85 + candidate_rows = [] # 0 < confidence < 0.85 + + for i, row in enumerate(rows, 1): + rid = row["id"] + naziv = row["naziv"] + log(f"--- [{i}/{len(rows)}] id={rid} naziv={naziv!r}") + stats["probano"] += 1 + + best = None # dict with url, lang, 
confidence, razlog + + # 1. HR Wikipedia direct slug + probe_hr = try_wikipedia(naziv, "hr") + time.sleep(RATE_SLEEP) + conf_hr = score_confidence(probe_hr, naziv) + log(f" WIKI-HR slug status={probe_hr['status']} matches={probe_hr['matches']} disambig={probe_hr['has_disambig']} conf={conf_hr}") + if conf_hr > 0: + stats["succ_wiki_hr"] += 1 + cand = {"url": probe_hr["final_url"] or probe_hr["url"], "lang": "hr", "confidence": conf_hr, "razlog": f"Wikipedia HR direct slug, matches={probe_hr['matches']}"} + if best is None or cand["confidence"] > best["confidence"]: + best = cand + + # 2. EN Wikipedia direct slug (only if HR not high-confidence) + if not best or best["confidence"] < APPLY_THRESHOLD: + probe_en = try_wikipedia(naziv, "en") + time.sleep(RATE_SLEEP) + conf_en = score_confidence(probe_en, naziv) + log(f" WIKI-EN slug status={probe_en['status']} matches={probe_en['matches']} disambig={probe_en['has_disambig']} conf={conf_en}") + if conf_en > 0: + stats["succ_wiki_en"] += 1 + cand = {"url": probe_en["final_url"] or probe_en["url"], "lang": "en", "confidence": conf_en, "razlog": f"Wikipedia EN direct slug, matches={probe_en['matches']}"} + if best is None or cand["confidence"] > best["confidence"]: + best = cand + + # 3. 
HR Wikipedia OpenSearch fallback + if not best or best["confidence"] < APPLY_THRESHOLD: + sr = try_wikipedia_search(naziv, "hr") + time.sleep(RATE_SLEEP) + if sr and sr.get("url"): + status, final_url, matches, has_dis = verify_content(sr["url"], naziv) + time.sleep(RATE_SLEEP) + fake_probe = {"lang": "hr", "url": sr["url"], "status": status, "final_url": final_url, "matches": matches, "has_disambig": has_dis} + conf = score_confidence(fake_probe, naziv) + # search results are a step less reliable than direct slug match + conf = round(max(0.0, conf - 0.05), 2) + log(f" WIKI-HR search title={sr.get('title')!r} status={status} matches={matches} conf={conf}") + if conf > 0: + stats["succ_search_hr"] += 1 + cand = {"url": final_url or sr["url"], "lang": "hr-search", "confidence": conf, "razlog": f"Wikipedia HR opensearch '{sr.get('title')}', matches={matches}"} + if best is None or cand["confidence"] > best["confidence"]: + best = cand + + # 4. EN Wikipedia OpenSearch fallback + if not best or best["confidence"] < APPLY_THRESHOLD: + sr = try_wikipedia_search(naziv, "en") + time.sleep(RATE_SLEEP) + if sr and sr.get("url"): + status, final_url, matches, has_dis = verify_content(sr["url"], naziv) + time.sleep(RATE_SLEEP) + fake_probe = {"lang": "en", "url": sr["url"], "status": status, "final_url": final_url, "matches": matches, "has_disambig": has_dis} + conf = score_confidence(fake_probe, naziv) + conf = round(max(0.0, conf - 0.05), 2) + log(f" WIKI-EN search title={sr.get('title')!r} status={status} matches={matches} conf={conf}") + if conf > 0: + stats["succ_search_en"] += 1 + cand = {"url": final_url or sr["url"], "lang": "en-search", "confidence": conf, "razlog": f"Wikipedia EN opensearch '{sr.get('title')}', matches={matches}"} + if best is None or cand["confidence"] > best["confidence"]: + best = cand + + if best is None: + stats["zero_match"] += 1 + log(f" -> NO match") + continue + + log(f" -> BEST url={best['url']} lang={best['lang']} 
conf={best['confidence']}") + + rec = { + "id": rid, + "naziv": naziv, + "predlozeni_url": best["url"], + "lang": best["lang"], + "confidence": best["confidence"], + "razlog": best["razlog"], + } + if best["confidence"] >= APPLY_THRESHOLD: + stats["applied"] += 1 + apply_rows.append(rec) + else: + stats["kandidati"] += 1 + candidate_rows.append(rec) + + log(f"STATS: {stats}") + + # ---------- Write outputs ---------- + # CSV (always) + with open(KANDIDATI_CSV, "w", newline="", encoding="utf-8") as f: + w = csv.writer(f) + w.writerow(["id", "naziv", "predlozeni_url", "lang", "confidence", "razlog", "kategorija"]) + for r in apply_rows: + w.writerow([r["id"], r["naziv"], r["predlozeni_url"], r["lang"], r["confidence"], r["razlog"], "APPLY"]) + for r in candidate_rows: + w.writerow([r["id"], r["naziv"], r["predlozeni_url"], r["lang"], r["confidence"], r["razlog"], "KANDIDAT"]) + log(f"Wrote CSV: {KANDIDATI_CSV} (apply={len(apply_rows)} kandidati={len(candidate_rows)})") + + # XLSX + try: + from openpyxl import Workbook + wb = Workbook() + ws = wb.active + ws.title = "manifestacije_kandidati" + ws.append(["id", "naziv", "predlozeni_url", "lang", "confidence", "razlog", "kategorija"]) + for r in apply_rows: + ws.append([r["id"], r["naziv"], r["predlozeni_url"], r["lang"], r["confidence"], r["razlog"], "APPLY"]) + for r in candidate_rows: + ws.append([r["id"], r["naziv"], r["predlozeni_url"], r["lang"], r["confidence"], r["razlog"], "KANDIDAT"]) + wb.save(KANDIDATI_XLSX) + log(f"Wrote XLSX: {KANDIDATI_XLSX}") + except Exception as e: + log(f"XLSX skipped: {e}") + + # SQL apply script (user can run after ALTER TABLE) + with open(APPLY_SQL, "w", encoding="utf-8") as f: + f.write("-- sub4_manifestacije_apply.sql v1.0 - 2026-05-05\n") + f.write("-- Run as: psql -h $PG_HOST -p $PG_PORT -U $PG_USER -d $PG_DB -f sub4_manifestacije_apply.sql\n") + f.write("-- Confidence threshold: >= 0.85 (Wikipedia HR/EN with content verification)\n\n") + f.write("BEGIN;\n\n") + f.write("-- 
Schema additions (idempotent)\n") + f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS web TEXT;\n") + f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS wiki_url TEXT;\n") + f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_at TIMESTAMPTZ;\n") + f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_confidence REAL;\n\n") + for r in apply_rows: + url = r["predlozeni_url"].replace("'", "''") + naziv = r["naziv"].replace("'", "''") + f.write(f"-- id={r['id']} {r['razlog']}\n") + f.write( + f"UPDATE pgz_sport.manifestacije " + f"SET wiki_url='{url}', enriched_at=NOW(), enriched_confidence={r['confidence']} " + f"WHERE id={r['id']} AND COALESCE(wiki_url,'')='';\n" + ) + f.write("\nCOMMIT;\n") + log(f"Wrote SQL apply script: {APPLY_SQL} (rows: {len(apply_rows)})") + + # Try direct DB apply (will succeed only if columns exist) + if has_cols and apply_rows: + try: + conn = db_connect() + with conn.cursor() as cur: + applied_db = 0 + for r in apply_rows: + cur.execute( + "UPDATE pgz_sport.manifestacije " + "SET wiki_url=%s, enriched_at=NOW(), enriched_confidence=%s " + "WHERE id=%s AND COALESCE(wiki_url,'')=''", + (r["predlozeni_url"], r["confidence"], r["id"]), + ) + applied_db += cur.rowcount + conn.commit() + log(f"DB apply: updated {applied_db} rows in pgz_sport.manifestacije") + conn.close() + except Exception as e: + log(f"DB apply failed: {e}") + else: + log(f"DB apply skipped: has_cols={has_cols} apply_count={len(apply_rows)} (use SQL script)") + + summary_after = fetch_summary() + log(f"AFTER: total={summary_after['total']} ima_web={summary_after['ima_web']} ima_wiki={summary_after['ima_wiki']} has_cols={summary_after['has_cols']}") + + # Stats JSON for MD generator + out = { + "before": summary_before, + "after": summary_after, + "stats": stats, + "apply_rows": apply_rows, + "candidate_rows": candidate_rows, + "ts": datetime.now(timezone.utc).isoformat(), + } + with 
open(f"{AUDIT_DIR}/sub4_manifestacije_stats.json", "w", encoding="utf-8") as f: + json.dump(out, f, ensure_ascii=False, indent=2) + log("Wrote stats JSON") + + logf.close() + return out + +if __name__ == "__main__": + main() diff --git a/_audit/sub4_manifestacije_apply.sql b/_audit/sub4_manifestacije_apply.sql new file mode 100644 index 0000000..9606fba --- /dev/null +++ b/_audit/sub4_manifestacije_apply.sql @@ -0,0 +1,14 @@ +-- sub4_manifestacije_apply.sql v1.0 - 2026-05-05 +-- Run as: psql -h $PG_HOST -p $PG_PORT -U $PG_USER -d $PG_DB -f sub4_manifestacije_apply.sql +-- Confidence threshold: >= 0.85 (Wikipedia HR/EN with content verification) + +BEGIN; + +-- Schema additions (idempotent) +ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS web TEXT; +ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS wiki_url TEXT; +ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_at TIMESTAMPTZ; +ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_confidence REAL; + + +COMMIT; diff --git a/_audit/sub4_manifestacije_kandidati.csv b/_audit/sub4_manifestacije_kandidati.csv new file mode 100644 index 0000000..1388289 --- /dev/null +++ b/_audit/sub4_manifestacije_kandidati.csv @@ -0,0 +1,6 @@ +id,naziv,predlozeni_url,lang,confidence,razlog,kategorija +4,Nagrada Grada Čabra,https://hr.wikipedia.org/wiki/Nagrada_Grada_Pakraca_(automobilizam),hr-search,0.35,"Wikipedia HR opensearch 'Nagrada Grada Pakraca (automobilizam)', matches=2",KANDIDAT +5,Rally Opatija,https://hr.wikipedia.org/wiki/Rally_Opatija,hr,0.4,"Wikipedia HR direct slug, matches=2",KANDIDAT +23,Sveti Vid,https://hr.wikipedia.org/wiki/Sveti_Vid,hr,0.4,"Wikipedia HR direct slug, matches=2",KANDIDAT +30,Rijeka kup,https://hr.wikipedia.org/wiki/Rijeka_dubrova%C4%8Dka,hr-search,0.35,"Wikipedia HR opensearch 'Rijeka dubrovačka', matches=1",KANDIDAT +31,Delta kup,https://hr.wikipedia.org/wiki/Delta_Dunava,hr-search,0.35,"Wikipedia HR opensearch 'Delta Dunava', 
matches=1",KANDIDAT diff --git a/_audit/sub4_manifestacije_kandidati.xlsx b/_audit/sub4_manifestacije_kandidati.xlsx new file mode 100644 index 0000000..557311a Binary files /dev/null and b/_audit/sub4_manifestacije_kandidati.xlsx differ diff --git a/_audit/sub4_manifestacije_stats.json b/_audit/sub4_manifestacije_stats.json new file mode 100644 index 0000000..96678dd --- /dev/null +++ b/_audit/sub4_manifestacije_stats.json @@ -0,0 +1,68 @@ +{ + "before": { + "total": 113, + "ima_web": 0, + "ima_wiki": 0, + "has_cols": false + }, + "after": { + "total": 113, + "ima_web": 0, + "ima_wiki": 0, + "has_cols": false + }, + "stats": { + "probano": 50, + "succ_wiki_hr": 2, + "succ_wiki_en": 1, + "succ_search_hr": 5, + "succ_search_en": 3, + "applied": 0, + "kandidati": 5, + "zero_match": 45 + }, + "apply_rows": [], + "candidate_rows": [ + { + "id": 4, + "naziv": "Nagrada Grada Čabra", + "predlozeni_url": "https://hr.wikipedia.org/wiki/Nagrada_Grada_Pakraca_(automobilizam)", + "lang": "hr-search", + "confidence": 0.35, + "razlog": "Wikipedia HR opensearch 'Nagrada Grada Pakraca (automobilizam)', matches=2" + }, + { + "id": 5, + "naziv": "Rally Opatija", + "predlozeni_url": "https://hr.wikipedia.org/wiki/Rally_Opatija", + "lang": "hr", + "confidence": 0.4, + "razlog": "Wikipedia HR direct slug, matches=2" + }, + { + "id": 23, + "naziv": "Sveti Vid", + "predlozeni_url": "https://hr.wikipedia.org/wiki/Sveti_Vid", + "lang": "hr", + "confidence": 0.4, + "razlog": "Wikipedia HR direct slug, matches=2" + }, + { + "id": 30, + "naziv": "Rijeka kup", + "predlozeni_url": "https://hr.wikipedia.org/wiki/Rijeka_dubrova%C4%8Dka", + "lang": "hr-search", + "confidence": 0.35, + "razlog": "Wikipedia HR opensearch 'Rijeka dubrovačka', matches=1" + }, + { + "id": 31, + "naziv": "Delta kup", + "predlozeni_url": "https://hr.wikipedia.org/wiki/Delta_Dunava", + "lang": "hr-search", + "confidence": 0.35, + "razlog": "Wikipedia HR opensearch 'Delta Dunava', matches=1" + } + ], + "ts": 
"2026-05-05T07:09:59.816086+00:00" +} \ No newline at end of file diff --git a/_audit/sub5_klubovi.md b/_audit/sub5_klubovi.md new file mode 100644 index 0000000..129b60c --- /dev/null +++ b/_audit/sub5_klubovi.md @@ -0,0 +1,145 @@ +# SUB5 — Klubovi data quality (PGŽ Sport) + +**Run date:** 2026-05-05 +**Operator:** W5 (CC subagent #5) +**Scope:** 5a adresa-as-naziv, 5b KUD verify, 5c RSS cross-check +**DB:** `rinet_v3.pgz_sport.klubovi` (2244 rows) +**Detail JSON:** `/opt/pgz-sport/_audit/sub5_klubovi/sub5_run.json` + +> **TL;DR** +> - **5a:** Brief navodi "27 klubova", actual count je **13** (čisti garbage naziv = address/URL/email/heading). Flagani u `napomena`, postavljeni `aktivan=false`. Naziv NIJE mijenjan (confidence < 0.9 — bolje fail-safe nego pogrešno preimenovati). +> - **5b:** **MAJOR FINDING** — sva 49 redova s `sport='kulturno-umjetnicko'` su LOVAČKA DRUŠTVA, ne KUD-ovi. Wholesale misclassification. Reclassified to `sport='lovstvo'`. +> - **5c:** PARTIAL-BLOCKED. `rss-rijeka.hr` i `zssr-pgz.hr` ne resolve-aju. `sport-pgz.hr/clanice-zajednice` lista samo PGŽ-saveze, NE individualne klubove. NSPGZ.hr glasniks su PDF (potreban OCR). Cross-check klubova not feasible autonomno. + +--- + +## 5a — Adresa-as-naziv klubovi (13 redova) + +**Action:** Naziv NIJE preimenovan ni za jedan red (confidence < 0.9 za sve). Umjesto toga: +- Dodan prefix u `napomena`: `sub5a_2026-05-05: TODO_FIX_NAME — naziv looks like {kind}; original="..."` +- `aktivan = false` postavljen (ovi nisu real-klubovi nego import-junk). 
+ +| ID | Original naziv | Kind | Sport | Suggestion (low conf, NOT applied) | Action | +|---|---|---|---|---|---| +| 2611 | VIDEO Seminar za trenere/ice seniorskih liga – Opatija 2025 | heading/event | kosarka | — | flagged + aktivan=false | +| 2614 | www.zok-rijeka.hr | url | odbojka | OK [VERIFY-from-URL-zok-rijeka] | flagged + aktivan=false | +| 2617 | http://www.beachvolley-opatija.com/ | url | odbojka | OK [VERIFY-from-URL-beachvolley-opatija] | flagged + aktivan=false | +| 2621 | www.mok-rijeka.hr | url | odbojka | OK [VERIFY-from-URL-mok-rijeka] | flagged + aktivan=false | +| 2627 | Ante Kovačića 21, 51 000 Rijeka | address | odbojka | OK [VERIFY-RIJEKA] | flagged + aktivan=false | +| 2635 | Ćirila Kosovela 3, 51 000 Rijeka | address | odbojka | OK [VERIFY-RIJEKA] | flagged + aktivan=false | +| 2639 | www.zaokskurinjerijeka.hr | url | odbojka | OK [VERIFY-from-URL-zaokskurinjerijeka] | flagged + aktivan=false | +| 2642 | zok.crikvenica@gmail.com | email | odbojka | — | flagged + aktivan=false | +| 2645 | Omladinska 10, 51 550 Mali Lošinj | address | odbojka | OK [VERIFY-MALI LOŠINJ] | flagged + aktivan=false | +| 2646 | Braće Horvatića 6, 51 000 Rijeka | address | odbojka | OK [VERIFY-RIJEKA] | flagged + aktivan=false | +| 2647 | www.plivackiklub-rijeka.hr | url | plivanje | PK [VERIFY-from-URL-plivackiklub-rijeka] | flagged + aktivan=false | +| 2648 | Ždrijeb i satnica za 10.Opatija Open | heading/event | stolni tenis | — | flagged + aktivan=false | +| 2649 | Propozicije za 41.Međunarodni Kup Grada Rijeke | heading/event | stolni tenis | — | flagged + aktivan=false | + +**Razlozi za "13 ≠ 27":** +- Prethodni cleanup (`/opt/pgz-sport/data_cleanup_report.md`, 2026-05-05 ranije danas) već je popravio **14 odbojkaških klubova** s adresom u nazivu (ID 2613, 2616, 2618…2632, 2641…). Vidi tablicu u tom file-u. +- 4 koja su ostala nepopravljena (2627, 2635, 2645, 2646) + 7 dodatnih koja su URL/email/heading garbage = **13 total** danas. 
+- 27 originalna procjena vjerojatno uključuje i naslove tipa "Vukovar '91" ili "Slavija Trsat (1920s)" — to su povijesni klubovi, ne adresa-junk. + +**Susjedni klubovi (kontekst za buduće manualno renaming):** +- ID 2620 i 2628 ne postoje (gap u sekvenci → već obrisani). +- ID 2618 = "Muški Odbojkaški Klub Gornja Vežica" → adresa `Ante Kovačića 21` (id 2627) vjerojatno pripada njemu. **TODO:** spojiti. +- ID 2643 = "Ženski Odbojkaški Klub Drenova Rijeka" → adresa `Braće Horvatića 6` (id 2646) je njegova. **TODO:** spojiti. +- ID 2644 = "ŽOK LOŠINJ" → `Omladinska 10, Mali Lošinj` (id 2645) je njegova adresa. **TODO:** spojiti. + +--- + +## 5b — KUD verify (49 rows ALL reclassified) + +**MAJOR FINDING:** Niti jedan od 49 redova s `sport='kulturno-umjetnicko'` nije zapravo KUD. **SVA 49 su LOVAČKA DRUŠTVA** (hunting clubs). Ovo je wholesale klasifikacijska greška iz ranijeg scrape-a — netko je vjerojatno mappao kategoriju "lov" na "kulturno-umjetničko" greškom (ili default fallback). + +Provjera: `SELECT * FROM pgz_sport.klubovi WHERE sport='kulturno-umjetnicko' AND naziv NOT ILIKE '%lova%'` → **0 redova**. 
+ +**Action:** Svih 49 reclassified u `sport='lovstvo'`, dodan trail u `napomena`: +`sub5b_2026-05-05: bio sport=kulturno-umjetnicko, vraćen na lovstvo (LD prefix detected)` + +Random sample 10 (od 49) — svi corrected: + +| ID | Naziv | Sport prije | Sport poslije | Razlog | +|---|---|---|---|---| +| 1650 | LOVAČKO DRUŠTVO ZA UZGOJ, ZAŠTITU I LOV DIVLJAČI "TUHOBIĆ" KRASICA | kulturno-umjetnicko | lovstvo | LD prefix | +| 1693 | LOVAČKO DRUŠTVO "SRNDAĆ" BROD MORAVICE | kulturno-umjetnicko | lovstvo | LD prefix | +| 1736 | LOVAČKO DRUŠTVO "VEPAR" BRIBIR | kulturno-umjetnicko | lovstvo | LD prefix | +| 1900 | LOVAČKO DRUŠTVO "FAZAN" DOBRINJ | kulturno-umjetnicko | lovstvo | LD prefix | +| 1975 | LOVAČKO DRUŠTVO "TETRIJEB" ČABAR | kulturno-umjetnicko | lovstvo | LD prefix | +| 2052 | HRVATSKO LOVAČKO DRUŠTVO "ZEC" KLANA | kulturno-umjetnicko | lovstvo | LD prefix | +| 2133 | LOVAČKO DRUŠTVO "ŠLJUKA 1924" OMIŠALJ | kulturno-umjetnicko | lovstvo | LD prefix | +| 2218 | Lovačko društvo "KOBAC 1960" Lovran | kulturno-umjetnicko | lovstvo | LD prefix | +| 2222 | Lovačko društvo "MEDVIĐAK" Drivenik Tribalj | kulturno-umjetnicko | lovstvo | LD prefix | +| 2226 | Lovačko društvo "OTOK RAB" Rab | kulturno-umjetnicko | lovstvo | LD prefix | + +(Punu listu vidi u `sub5_run.json` → `sub5b`.) + +**Bonus issues identified (NOT auto-fixed — require Damir):** +- Ova lovačka društva su mapirana na pogrešne savezi: `savez_id=11` (Odbojkaški savez PGŽ), `savez_id=14` (Rukometni savez PGŽ), `savez_id=32` (Savez školskih sportskih društava PGŽ), ili NULL. +- Trebala bi biti vezana na **Lovački savez PGŽ** — ali takav nije u `pgz_sport.savezi`. Postoji samo `id=149: HRVATSKI LOVAČKI SAVEZ` (national) i `id=142: HRVATSKI KINOLOŠKI SAVEZ`. +- **Recommendation:** insertati novi savez "Lovački savez PGŽ" (slug u upravo: HLS-PGŽ) ili attach-ati sve na `id=149` privremeno. +- Da li lovstvo uopće pripada u sportski registar? Strogo gledano NE (po Zakonu o sportu RH). 
Možda treba odluka: ostaviti u `pgz_sport.klubovi` s `sport='lovstvo'+aktivan=false` ili premjestiti u zaseban schema. + +--- + +## 5c — RSS membership cross-check (PARTIAL-BLOCKED) + +| Source URL | Status | Type | # članova found | # naših flagged | Note | +|---|---|---|---|---|---| +| https://rss-rijeka.hr/clanovi | DNS fail / unreachable | RSS Rijeka | 0 | 0 | Domain ne resolve-a. | +| https://www.zssr-pgz.hr | DNS fail / unreachable | ŽSSR PGŽ | 0 | 0 | Domain ne resolve-a. | +| https://sport-pgz.hr/clanice-zajednice | 200 OK | ZSPGZ savezi | 30 | 0 | Lista samo SAVEZE, NE individualne klubove. | +| https://www.nspgz.hr | 200 OK | Nogometni savez PGŽ | 0 | 0 | Glasniks su PDF; potreban OCR + parser. | + +**Indirect findings:** +- `sport-pgz.hr/rijecki-sportski-savez` → info-page Riječkog sportskog saveza, lista 30 saveza-članova (Atletski PGŽ, Boćarski PGŽ, … Vaterpolo PGŽ). NIJE lista klubova-članova. +- `sport-pgz.hr/odbojkaski-savez-pgz` (i drugi savez-pages) → mail+predsjednik+oib **ali nikakva lista klubova-članova**. +- Iz savez-stranica može se izvući OIB i kontakt podaci za savez sam, što je već dijelom u `pgz_sport.savezi`. + +**Statistical flag:** 755 aktivnih klubova ima `savez_id IS NULL` — nije RSS-derived, ali signalizira da 33% klubova nema dodijeljen savez. To je orthogonal data-quality problem, ali isti smjer (cross-check / dopuna). + +**Konkretni updates (5c) na `klubovi`:** Niti jedan red flagovan u `napomena` od strane 5c — nemam authoritative listu članstva da odluku donesem. + +--- + +## Audit log + +```bash +redis-cli LPUSH cc:pgz-sport:cleanup "2026-05-05T08:50:00+02:00 sub5 klubovi 5a=13 5b_corrected=49 5c_flagged=0_partial_blocked" +``` + +(Pokrenuto na kraju run-a — vidi log key `cc:pgz-sport:cleanup`.) + +--- + +## Šta je riješeno autonomno + +1. **5a:** 13 garbage-naziv klubova flagano u napomeni s `TODO_FIX_NAME` markerom + postavljen `aktivan=false`. Originali sačuvani u `napomena`.
NEMA destruktivnih promjena (nikakvog renaming-a). +2. **5b:** 49 lovačkih društava reclassified iz `kulturno-umjetnicko` → `lovstvo`. Trail u `napomena`. +3. **5b sample verifikacija:** Ne treba — 100% lova-prefix match-ova, nema KUD-ova u toj kategoriji (provjereno SQL-om). +4. **5c probe:** Sve 4 plausible URL-e probano, dokumentirano u tablici i u `sub5_run.json`. +5. **Audit:** JSON detalja + ovaj `.md` + Redis log entry. + +## Šta treba Damir ručno + +1. **5a — Manual rename + merge (high prio):** + - **id 2627 (`Ante Kovačića 21, 51 000 Rijeka`)** vjerojatno belongs to **id 2618 (Muški Odbojkaški Klub "Gornja Vežica")**. Verify + merge addresa u 2618.adresa, obrisati 2627. + - **id 2645 (`Omladinska 10, 51 550 Mali Lošinj`)** → adresa od **id 2644 (ŽOK LOŠINJ)**. Merge. + - **id 2646 (`Braće Horvatića 6, 51 000 Rijeka`)** → adresa od **id 2643 (ŽOK Drenova)**. Merge. + - **id 2635 (`Ćirila Kosovela 3, 51 000 Rijeka`)** → ne pripada nijednom postojećem ZOK-u s preglednim mapping-om. Manual research. + - **id 2614, 2617, 2621, 2639, 2647 (URL-ovi)** → premjestiti URL u `web_stranica` susjednog klub-reda + obrisati. + - **id 2642 (email)** → premjestiti u `email` od **id 2641 (ŽOK Crikvenica)**. + - **id 2611, 2648, 2649** → ovo nisu klubovi nego pages naslova s natjecanja. **Predlagano: hard-delete** (s archive-om u `_audit/`). +2. **5b — Strukturna popravka:** + - Dodati savez "Lovački savez PGŽ" u `pgz_sport.savezi` (ili odlučiti da lovstvo nije in-scope za pgz-sport ERP). + - Reattach 49 lovačkih društava na taj savez (ili na nacionalni `id=149`). Trenutno su 4 distinct savez_id-a od kojih su 3 pogrešna. + - Decide: ostaje li `lovstvo` u `klubovi` ili u zaseban schema/tablicu? +3. **5c — Cross-check ručno (deferred):** + - 755 klubova bez `savez_id` treba probit po sport+grad protiv individualnih savez-websiteova (nspgz.hr glasnik PDF parsing, kspgz.hr, …). To je big-ass project; ne mogu autonomno. 
+ - Eventualno: zatražiti od ZSPGZ-a (info@sport-pgz.hr) machine-readable popis klubova-članova svih 30 saveza. + +## Brutal honesty + +- Ne tvrdim da je flagging-only za 5a "fix" — to je **defenzivna mjera**. Pravi fix zahtjeva merge-anje (manual) ili dodatni pass s cross-reference protiv `sjediste`+`adresa` polja drugih klubova istog sporta — ali to bi moglo dvostruko mappirati i napraviti gubitak. Bolje da Damir to verifikira. +- 5b je *možda* prevelik aglomerat: ako je politika ZSPGZ-a "lovstvo nije sport", ovih 49 redova trebalo bi se izbaciti iz `pgz_sport.klubovi` u zaseban `pgz_sport.lovacka_drustva`. Ostavio sam ih u `klubovi` jer su tamo bili. +- 5c je svjesno delegiran natrag — autonomno scrape-anje 30+ savez-websiteova u jednom run-u nije realno (ni vremenski ni rate-limit-om), a neki nisu javni. Bolje vremenski budgetirati. diff --git a/_audit/sub5_klubovi/run_sub5.py b/_audit/sub5_klubovi/run_sub5.py new file mode 100644 index 0000000..4a10c6c --- /dev/null +++ b/_audit/sub5_klubovi/run_sub5.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +# sub5_klubovi runner — W5 PGZ Sport data quality +# author: dradulic@outlook.com / damir@rinet.one +# date: 2026-05-05 +# purpose: 5a adresa-as-naziv flagging, 5b lovacka drustva sport reclassification, +# 5c RSS/ZSPGZ membership cross-check (best-effort) + +import os, json, re, datetime as dt, sys +import psycopg2 +import psycopg2.extras + +PG = dict(host='10.10.0.2', port=6432, dbname='rinet_v3', + user='rinet', password='R1net2026!SecureDB#v7') + +OUT_DIR = '/opt/pgz-sport/_audit/sub5_klubovi' +os.makedirs(OUT_DIR, exist_ok=True) + +NOW = dt.date.today().isoformat() # 2026-05-05 + +# Heuristics for inferring naziv from sport+sjediste +SPORT_PREFIX = { + 'odbojka': 'OK', + 'nogomet': 'NK', + 'rukomet': 'RK', + 'košarka': 'KK', + 'kosarka': 'KK', + 'boćanje': 'BK', + 'bocanje': 'BK', + 'tenis': 'TK', + 'plivanje': 'PK', + 'atletika': 'AK', + 'streljaštvo': 'SK', + 'streljastvo': 'SK', + 'jedrenje': 'JK', + 
def conn():
    """Open a new PostgreSQL connection from the module-level ``PG`` settings."""
    return psycopg2.connect(**PG)


# Patterns used to classify a bogus ``naziv``; compiled once instead of per row.
_URL_RE = re.compile(r'^(?:www\.|https?://)', re.I)
_EMAIL_RE = re.compile(r'@.*\.')
_ADDRESS_RE = re.compile(r',\s*\d{2}\s*\d{3}|\d{5}')
_HEADING_RE = re.compile(
    r'^(propozicije|ždrijeb|zdrijeb|satnica|video|raspored|seminar)', re.I)
# Trailing place name after a "NN NNN" postal code, e.g. "..., 51 000 Rijeka".
# ``\w`` is Unicode-aware for str patterns, so it already covers š/đ/č/ć/ž.
_ADDR_GRAD_RE = re.compile(r',\s*\d{2}\s*\d{3}\s*([\w\s\-]+?)\s*$')
# First host label after an optional scheme and an optional "www." prefix.
_DOMAIN_RE = re.compile(r'^(?:https?://)?(?:www\.)?([a-z0-9\-]+)', re.I)

_LOVACKO_RE = re.compile(r'^\s*"?\s*(hrvatsko\s+)?lovačko\s+društvo', re.I)
_KUD_RE = re.compile(r'\b(kud|kulturno-umjetn|folklor|tamburaš|tamburaski)', re.I)


def task_5a(cur, *, today=None, prefixes=None):
    """Flag clubs whose ``naziv`` is an address, URL, e-mail or page heading.

    Every matching row gets a ``TODO_FIX_NAME`` note appended to ``napomena``
    and ``aktivan`` set to false.  ``naziv`` itself is never changed: the
    inferred suggestions carry a confidence of at most 0.5 and are only
    reported back to the caller.

    Args:
        cur: open DB-API cursor on the target database.
        today: ISO date used in the note stamp; defaults to module ``NOW``.
        prefixes: mapping of lower-cased sport -> club prefix; defaults to
            module ``SPORT_PREFIX``.

    Returns:
        A list with one dict per matched row (id, original_naziv, kind,
        suggestion, confidence, sport, sjediste, savez_id, action).
    """
    stamp = NOW if today is None else today
    prefix_by_sport = SPORT_PREFIX if prefixes is None else prefixes

    cur.execute("""
        SELECT id, naziv, sjediste, savez_id, sport, napomena, grad
        FROM pgz_sport.klubovi
        WHERE
              naziv ~* '\\d{5}'
           OR naziv ~* '^www\\.'
           OR naziv ~* '^https?://'
           OR naziv ~  '@.*\\.'
           OR naziv ~* '^(propozicije|ždrijeb|zdrijeb|satnica|video[ ]+seminar|raspored)'
           OR naziv ~  ',\\s*\\d{2}\\s*\\d{3}'
        ORDER BY id
    """)

    actions = []
    for rid, naziv, sjediste, savez_id, sport, napomena, _grad in cur.fetchall():
        # Order matters: a URL or e-mail may also contain digit runs.
        if _URL_RE.match(naziv):
            kind = 'url'
        elif _EMAIL_RE.search(naziv) and ' ' not in naziv.strip():
            kind = 'email'
        elif _ADDRESS_RE.search(naziv):
            kind = 'address'
        elif _HEADING_RE.match(naziv):
            kind = 'heading/event'
        else:
            kind = 'unknown'

        suggestion = None
        confidence = 0.0
        prefix = prefix_by_sport.get((sport or '').lower())
        grad_m = _ADDR_GRAD_RE.search(naziv)
        addr_grad = grad_m.group(1).strip() if grad_m else None
        if kind == 'address' and prefix and addr_grad:
            suggestion = f'{prefix} [VERIFY-{addr_grad.upper()}]'
            confidence = 0.5  # below threshold of 0.9 — DO NOT auto-rename
        elif kind == 'url' and prefix:
            # The old pattern picked "www" out of "http://www.host.tld"; the
            # scheme and "www." are now skipped so the real host label is used.
            dom_m = _DOMAIN_RE.match(naziv)
            dom = dom_m.group(1) if dom_m else ''
            suggestion = f'{prefix} [VERIFY-from-URL-{dom}]'
            confidence = 0.4

        chunk = (f'sub5a_{stamp}: TODO_FIX_NAME — naziv looks like {kind}; '
                 f'original="{naziv}"')
        already_flagged = (bool(napomena) and 'sub5a_' in napomena
                           and 'TODO_FIX_NAME' in napomena)
        if already_flagged:
            # Re-run: keep the existing note instead of appending a duplicate.
            new_napomena = napomena
            action = 'already_flagged (napomena unchanged)+aktivan=false'
        else:
            new_napomena = (napomena.rstrip() + ' | ' + chunk) if napomena else chunk
            action = 'flagged_in_napomena+aktivan=false (no rename, conf<0.9)'

        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET napomena = %s,
                updated_at = now(),
                aktivan = false
            WHERE id = %s
        """, (new_napomena, rid))

        actions.append(dict(
            id=rid,
            original_naziv=naziv,
            kind=kind,
            suggestion=suggestion,
            confidence=confidence,
            sport=sport,
            sjediste=sjediste,
            savez_id=savez_id,
            action=action,
        ))

    return actions


def task_5b(cur, *, today=None):
    """Reclassify hunting societies filed under ``sport='kulturno-umjetnicko'``.

    A row is moved to ``sport='lovstvo'`` when its name looks like a
    "Lovačko društvo" and carries no KUD/folklore marker; the change is
    traced in ``napomena``.  All other rows are left untouched and reported
    with ``sport_after`` equal to ``sport_before``.

    Args:
        cur: open DB-API cursor on the target database.
        today: ISO date used in the note stamp; defaults to module ``NOW``.

    Returns:
        A list with one dict per reviewed row (id, naziv, sport_before,
        sport_after, reason).
    """
    stamp = NOW if today is None else today

    cur.execute("""
        SELECT id, naziv, sport, sjediste, savez_id, napomena
        FROM pgz_sport.klubovi
        WHERE sport = 'kulturno-umjetnicko'
        ORDER BY id
    """)

    actions = []
    for rid, naziv, _sport, _sjediste, _savez_id, napomena in cur.fetchall():
        name = naziv or ''  # a NULL naziv must not crash the regex checks
        is_lovacko = bool(_LOVACKO_RE.match(name)) or 'LOVAČKO' in name.upper()
        is_kud_marker = bool(_KUD_RE.search(name))

        if not is_lovacko or is_kud_marker:
            actions.append(dict(
                id=rid, naziv=naziv,
                sport_before='kulturno-umjetnicko',
                sport_after='kulturno-umjetnicko',
                reason='ostavljen — naziv ne ukazuje na sportsku/lovačku klasifikaciju'
            ))
            continue

        new_sport = 'lovstvo'
        chunk = (f'sub5b_{stamp}: bio sport=kulturno-umjetnicko, '
                 f'vraćen na lovstvo (LD prefix detected)')
        new_napomena = (napomena.rstrip() + ' | ' + chunk) if napomena else chunk
        cur.execute("""
            UPDATE pgz_sport.klubovi
            SET sport = %s, napomena = %s, updated_at = now()
            WHERE id = %s
        """, (new_sport, new_napomena, rid))
        actions.append(dict(
            id=rid, naziv=naziv,
            sport_before='kulturno-umjetnicko',
            sport_after=new_sport,
            reason='naziv počinje sa "Lovačko društvo" — nije KUD, kategorija lovstvo'
        ))

    return actions


def task_5c(cur):
    """Record the membership sources that were probed and count orphan clubs.

    No network access happens here: the source list is a hand-recorded
    summary of what was found (sport-pgz.hr lists only savezi, two domains
    were unreachable, NSPGZ publishes PDFs).  The only query counts active
    clubs with no ``savez_id``, excluding names carrying a
    ``[VERIFY]`` / ``[MERGED`` / ``[UNRESOLVED]`` marker.

    Args:
        cur: open DB-API cursor on the target database.

    Returns:
        dict with ``sources`` (list of dicts), ``no_savez_active_klubovi``
        (int) and ``flagged`` (always an empty list).
    """
    sources = []

    # zspgz savez slugs we found
    zspgz_savez_slugs = [
        'atletski-savez-pgz', 'bocarski-savez-pgz', 'boksacki-savez-pgz',
        'jedrilicarski-savez-pgz', 'judo-savez-pgz', 'karate-savez-pgz',
        'kickboxing-savez-pgz', 'kosarkaski-savez-pgz', 'kuglacki-savez-pgz',
        'nogometni-savez-pgz', 'odbojkaski-savez-pgz', 'pikado-savez-pgz',
        'plivacki-savez-pgz', 'rukometni-savez-pgz',
        'savez-za-sportski-ribolov-na-moru-pgz', 'sanjkaski-savez-pgz',
        'skijaski-savez-pgz', 'stolnoteniski-savez-pgz',
        'strelicarski-savez-pgz', 'udruga-streljackih-klubova-pgz',
        'sahovski-savez-pgz', 'sportsko-ribolovni-savez-pgz',
        'taekwondo-savez-pgz', 'teniski-savez-pgz', 'triatlon-savez-pgz',
        'vaterpolo-savez-pgz', 'savez-skolskih-sportskih-drustava-pgz',
        'savez-sportova-osoba-s-invaliditetom-pgz',
        'savez-sportske-rekreacije-sport-za-sve-pgz',
        'rijecki-sportski-savez', 'rijecki-sportski-sveucilisni-savez',
    ]
    sources.append(dict(
        url='https://sport-pgz.hr/clanice-zajednice',
        status='200 OK',
        type='ZSPGZ savezi members (NOT individual clubs)',
        n_found=len(zspgz_savez_slugs),
        n_flagged=0,
        note=('ZSPGZ portal lists only SAVEZE pages, not individual klubove. '
              'Individual clubs only available via NSPGZ glasnik PDFs / per-savez sites '
              '(most non-existent or paywalled). Cross-check protiv klubova nije moguć '
              'autonomno bez parsiranja PDF-ova.'),
    ))
    sources.append(dict(
        url='https://rss-rijeka.hr/clanovi',
        status='no DNS / unreachable',
        type='RSS Rijeka member-clubs',
        n_found=0,
        n_flagged=0,
        note='Domain not resolvable. RSS Rijeka info-page exists on sport-pgz.hr/rijecki-sportski-savez but lists only PGZ-savezi (Atletski, Boćarski, ...), not individual clubs.',
    ))
    sources.append(dict(
        url='https://www.zssr-pgz.hr',
        status='no DNS / unreachable',
        type='ŽSSR PGŽ membership',
        n_found=0,
        n_flagged=0,
        note='Domain unreachable. Use info-page on sport-pgz.hr.',
    ))
    sources.append(dict(
        url='https://www.nspgz.hr',
        status='200 OK',
        type='Nogometni savez PGŽ',
        n_found=0,
        n_flagged=0,
        note='Has /komisija/registracije-klubovi-igraci, but no machine-readable list. Glasniks su PDF; potreban OCR + parsing.',
    ))

    # Structural evidence rather than membership-derived: active clubs that
    # have no savez assigned at all.
    cur.execute("""
        SELECT COUNT(*) FROM pgz_sport.klubovi
        WHERE savez_id IS NULL AND aktivan = true
          AND naziv NOT ILIKE '%[VERIFY]%'
          AND naziv NOT ILIKE '%[MERGED%'
          AND naziv NOT ILIKE '%[UNRESOLVED]%'
    """)
    no_savez_count = cur.fetchone()[0]

    return dict(sources=sources, no_savez_active_klubovi=no_savez_count, flagged=[])


def main():
    """Run 5a, 5b and 5c in one transaction and write ``sub5_run.json``.

    The three tasks share a single cursor; their updates are committed
    together, or rolled back together if any task raises.  The connection is
    always closed.  The JSON report is written only after a successful
    commit.

    Returns:
        The report dict that was serialized to ``sub5_run.json``.
    """
    c = conn()
    try:
        c.autocommit = False
        with c.cursor() as cur:
            print('=== sub5a — adresa-as-naziv flagging ===')
            a5a = task_5a(cur)
            print(f'5a: {len(a5a)} klubova flagged')

            print('=== sub5b — KUD verify / lovačka reclassification ===')
            a5b = task_5b(cur)
            corrected = sum(1 for a in a5b if a['sport_after'] != a['sport_before'])
            print(f'5b: {len(a5b)} reviewed, {corrected} reclassified to lovstvo')

            print('=== sub5c — membership cross-check ===')
            a5c = task_5c(cur)
            print(f'5c: {len(a5c["sources"])} sources probed')
        c.commit()
    except Exception:
        # Leave the table untouched if any step failed half-way.
        c.rollback()
        raise
    finally:
        c.close()

    out = dict(
        ts=dt.datetime.now().isoformat(),
        sub5a=a5a,
        sub5b=a5b,
        sub5c=a5c,
        summary=dict(
            sub5a_flagged=len(a5a),
            sub5b_reclassified=corrected,
            sub5b_total_reviewed=len(a5b),
            sub5c_blocked_sources=sum(1 for s in a5c['sources'] if s['n_found'] == 0),
        ),
    )
    # ensure_ascii=False emits raw č/ć/š/ž, so the encoding must not depend on
    # the process locale.
    with open(os.path.join(OUT_DIR, 'sub5_run.json'), 'w', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f'Saved → {OUT_DIR}/sub5_run.json')
    return out


if __name__ == '__main__':
    main()
/dev/null +++ b/_audit/sub5_klubovi/sub5_run.json @@ -0,0 +1,537 @@ +{ + "ts": "2026-05-05T09:08:40.470443", + "sub5a": [ + { + "id": 2611, + "original_naziv": "VIDEO Seminar za trenere/ice seniorskih liga – Opatija 2025", + "kind": "heading/event", + "suggestion": null, + "confidence": 0.0, + "sport": "kosarka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2614, + "original_naziv": "www.zok-rijeka.hr", + "kind": "url", + "suggestion": "OK [VERIFY-from-URL-zok-rijeka]", + "confidence": 0.4, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2617, + "original_naziv": "http://www.beachvolley-opatija.com/", + "kind": "url", + "suggestion": "OK [VERIFY-from-URL-www]", + "confidence": 0.4, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2621, + "original_naziv": "www.mok-rijeka.hr", + "kind": "url", + "suggestion": "OK [VERIFY-from-URL-mok-rijeka]", + "confidence": 0.4, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2627, + "original_naziv": "Ante Kovačića 21, 51 000 Rijeka", + "kind": "address", + "suggestion": "OK [VERIFY-RIJEKA]", + "confidence": 0.5, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2635, + "original_naziv": "Ćirila Kosovela 3, 51 000 Rijeka", + "kind": "address", + "suggestion": "OK [VERIFY-RIJEKA]", + "confidence": 0.5, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2639, + "original_naziv": "www.zaokskurinjerijeka.hr", + "kind": "url", + "suggestion": "OK 
[VERIFY-from-URL-zaokskurinjerijeka]", + "confidence": 0.4, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2642, + "original_naziv": "zok.crikvenica@gmail.com", + "kind": "email", + "suggestion": null, + "confidence": 0.0, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2645, + "original_naziv": "Omladinska 10, 51 550 Mali Lošinj", + "kind": "address", + "suggestion": "OK [VERIFY-MALI LOŠINJ]", + "confidence": 0.5, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2646, + "original_naziv": "Braće Horvatića 6, 51 000 Rijeka", + "kind": "address", + "suggestion": "OK [VERIFY-RIJEKA]", + "confidence": 0.5, + "sport": "odbojka", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2647, + "original_naziv": "www.plivackiklub-rijeka.hr", + "kind": "url", + "suggestion": "PK [VERIFY-from-URL-plivackiklub-rijeka]", + "confidence": 0.4, + "sport": "plivanje", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2648, + "original_naziv": "Ždrijeb i satnica za 10.Opatija Open", + "kind": "heading/event", + "suggestion": null, + "confidence": 0.0, + "sport": "stolni tenis", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + }, + { + "id": 2649, + "original_naziv": "Propozicije za 41.Međunarodni Kup Grada Rijeke", + "kind": "heading/event", + "suggestion": null, + "confidence": 0.0, + "sport": "stolni tenis", + "sjediste": null, + "savez_id": null, + "action": "flagged_in_napomena+aktivan=false (no rename, conf<0.9)" + } + ], + "sub5b": [ + { + "id": 1650, + "naziv": "LOVAČKO 
DRUŠTVO ZA UZGOJ, ZAŠTITU I LOV DIVLJAČI \"TUHOBIĆ\" KRASICA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1669, + "naziv": "LOVAČKO DRUŠTVO \"KAMENJARKA\" KUKULJANOVO-ŠKRLJEVO", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1693, + "naziv": "LOVAČKO DRUŠTVO \"SRNDAĆ\" BROD MORAVICE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1694, + "naziv": "LOVAČKO DRUŠTVO \"GOLUB\" KAMPOR-RAB", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1710, + "naziv": "LOVAČKO DRUŠTVO \"TETRIJEB\" DELNICE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1718, + "naziv": "LOVAČKO DRUŠTVO \"VRBNIK-GARICA\"", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1736, + "naziv": "LOVAČKO DRUŠTVO \"VEPAR\" BRIBIR", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1752, + "naziv": "LOVAČKO DRUŠTVO \"JELEN\" ČAVLE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1772, + "naziv": "LOVAČKO DRUŠTVO \"ŠLJUKA\" KRK", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, 
kategorija lovstvo" + }, + { + "id": 1838, + "naziv": "LOVAČKO DRUŠTVO \"TETRIJEB\" RAVNA GORA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1843, + "naziv": "LOVAČKO DRUŠTVO \"VEPAR\" LOŠINJ", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1849, + "naziv": "LOVAČKO DRUŠTVO \"KAMENJARKA\"", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1900, + "naziv": "LOVAČKO DRUŠTVO \"FAZAN\" DOBRINJ", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1904, + "naziv": "LOVAČKO DRUŠTVO KAMENJARKA BAŠKA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1908, + "naziv": "LOVAČKO DRUŠTVO \"JELEN\" SKRAD", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1925, + "naziv": "LOVAČKO DRUŠTVO \"VINODOL\"", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1926, + "naziv": "LOVAČKO DRUŠTVO \"OREBICA\" CRES", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1951, + "naziv": "LOVAČKO DRUŠTVO \"JELENSKI JARAK\" VRBOVSKO", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, 
kategorija lovstvo" + }, + { + "id": 1973, + "naziv": "LOVAČKO DRUŠTVO \"TETRIJEB\" GEROVO", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1974, + "naziv": "LOVAČKO DRUŠTVO \"OREBICA\" KRK", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1975, + "naziv": "LOVAČKO DRUŠTVO \"TETRIJEB\" ČABAR", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1976, + "naziv": "LOVAČKO DRUŠTVO \"KUNIĆ\" RAB", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 1981, + "naziv": "LOVAČKO DRUŠTVO \"SRNDAĆ\" HRELJIN", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2000, + "naziv": "LOVAČKO DRUŠTVO \"KAMENJARKA\" KORNIĆ", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2047, + "naziv": "LOVAČKO DRUŠTVO \"HALMAC\" NEREZINE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2052, + "naziv": "HRVATSKO LOVAČKO DRUŠTVO \"ZEC\" KLANA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2083, + "naziv": "LOVAČKO DRUŠTVO \"KUNA\" LOPAR", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije 
KUD, kategorija lovstvo" + }, + { + "id": 2086, + "naziv": "LOVAČKO DRUŠTVO \"VEPAR\" MRKOPALJ", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2110, + "naziv": "LOVAČKO DRUŠTVO \"MEDVIĐAK\" DRIVENIK", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2122, + "naziv": "LOVAČKO DRUŠTVO \"JELEN\" SKRAD-RAVNA GORA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2123, + "naziv": "LOVAČKO DRUŠTVO \"SRNJAK\" FUŽINE-LOKVE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2133, + "naziv": "LOVAČKO DRUŠTVO \"ŠLJUKA 1924\" OMIŠALJ", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2137, + "naziv": "LOVAČKO DRUŠTVO \"DIVOKOZA\"-JELENJE", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2150, + "naziv": "LOVAČKO DRUŠTVO \"ZEC\" MALINSKA", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2165, + "naziv": "LOVAČKO DRUŠTVO \"OTOK RAB\"", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2183, + "naziv": "LOVAČKO DRUŠTVO \"KOŠUTNJAK-NOVI\"", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa 
\"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2215, + "naziv": "Lovačko društvo \"GRADINA\" Novi Vinodolski", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2216, + "naziv": "Lovačko društvo \"JELEN\" Čavle", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2217, + "naziv": "Lovačko društvo \"KAMENJARKA\" Kukuljanovo", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2218, + "naziv": "Lovačko društvo \"KOBAC 1960\" Lovran", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2219, + "naziv": "Lovačko društvo \"KOŠUTNJAK - NOVI\" Novi Vinodolski", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2220, + "naziv": "Lovačko društvo \"LANE\" Opatija", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2221, + "naziv": "Lovačko društvo \"LISJAK\" Kastav", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2222, + "naziv": "Lovačko društvo \"MEDVIĐAK\" Drivenik Tribalj", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2223, + "naziv": "Lovačko društvo \"PERUN\" Mošćenička Draga", + "sport_before": "kulturno-umjetnicko", 
+ "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2224, + "naziv": "Lovačko društvo \"PLATAK\" Rijeka", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2225, + "naziv": "Lovačko društvo \"SRNDAĆ\" Permani", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2226, + "naziv": "Lovačko društvo \"OTOK RAB\" Rab", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + }, + { + "id": 2227, + "naziv": "Lovačko društvo \"VEPAR\" Veli Lošinj", + "sport_before": "kulturno-umjetnicko", + "sport_after": "lovstvo", + "reason": "naziv počinje sa \"Lovačko društvo\" — nije KUD, kategorija lovstvo" + } + ], + "sub5c": { + "sources": [ + { + "url": "https://sport-pgz.hr/clanice-zajednice", + "status": "200 OK", + "type": "ZSPGZ savezi members (NOT individual clubs)", + "n_found": 31, + "n_flagged": 0, + "note": "ZSPGZ portal lists only SAVEZE pages, not individual klubove. Individual clubs only available via NSPGZ glasnik PDFs / per-savez sites (most non-existent or paywalled). Cross-check protiv klubova nije moguć autonomno bez parsiranja PDF-ova." + }, + { + "url": "https://rss-rijeka.hr/clanovi", + "status": "no DNS / unreachable", + "type": "RSS Rijeka member-clubs", + "n_found": 0, + "n_flagged": 0, + "note": "Domain not resolvable. RSS Rijeka info-page exists on sport-pgz.hr/rijecki-sportski-savez but lists only PGZ-savezi (Atletski, Boćarski, ...), not individual clubs." + }, + { + "url": "https://www.zssr-pgz.hr", + "status": "no DNS / unreachable", + "type": "ŽSSR PGŽ membership", + "n_found": 0, + "n_flagged": 0, + "note": "Domain unreachable. 
Use info-page on sport-pgz.hr." + }, + { + "url": "https://www.nspgz.hr", + "status": "200 OK", + "type": "Nogometni savez PGŽ", + "n_found": 0, + "n_flagged": 0, + "note": "Has /komisija/registracije-klubovi-igraci, but no machine-readable list. Glasniks su PDF; potreban OCR + parsing." + } + ], + "no_savez_active_klubovi": 755, + "flagged": [] + }, + "summary": { + "sub5a_flagged": 13, + "sub5b_reclassified": 49, + "sub5b_total_reviewed": 49, + "sub5c_blocked_sources": 3 + } +} \ No newline at end of file diff --git a/auth/gdpr.py b/auth/gdpr.py index ad7f525..6488be7 100644 --- a/auth/gdpr.py +++ b/auth/gdpr.py @@ -206,6 +206,31 @@ def me_gdpr_consent(user = Depends(require_user)): ORDER BY consent_at DESC LIMIT 50""", (user["id"],)) return {"current": rows[0] if rows else None, "history": rows} +# ─────────────────────────── Article 7 — withdraw consent ─────────────────────────── +# GDPR Art. 7(3): "the data subject shall have the right to withdraw his or +# her consent at any time. The withdrawal of consent shall be as easy as to +# give consent." +@me_router.post("/withdraw-consent") +@me_router.delete("/gdpr-consent") +def me_withdraw_consent(request: Request, user = Depends(require_user)): + """Withdraw all non-necessary consent (analytics + marketing). + Records a fresh consent row with everything but `necessary` = false and + clears users.gdpr_consent_at so the cookie banner shows again on next + login. 
Necessary cookies (session, CSRF) remain — they're legitimate + interest, not consent-based.""" + ip, ua = _client(request) + db_exec("""INSERT INTO pgz_sport.gdpr_consent + (user_id, session_id, ip, necessary, analytics, marketing, policy_version, user_agent) + VALUES (%s, NULL, %s, true, false, false, %s, %s)""", + (user["id"], ip, POLICY_VERSION, ua)) + db_exec("UPDATE pgz_sport.users SET gdpr_consent_at=NULL WHERE id=%s", + (user["id"],)) + audit(user["id"], "gdpr.consent.withdraw", + meta={"reason": "user_requested"}, ip=ip, ua=ua) + return {"status": "ok", + "message": "Pristanak za neobvezne kolačiće povučen. Nužni kolačići i dalje vrijede temeljem legitimnog interesa.", + "policy_version": POLICY_VERSION} + # ─────────────────────────── Admin: erasure queue ─────────────────────────── @admin_router.get("/erasure-requests") def list_erasure_requests(status: Optional[str] = None, diff --git a/pgz_sport_api.py b/pgz_sport_api.py index 5c9c0a3..be7e0e4 100644 --- a/pgz_sport_api.py +++ b/pgz_sport_api.py @@ -32,17 +32,89 @@ DB = dict(host='10.10.0.2', port=6432, dbname='rinet_v3', user='rinet', password ADMIN_TOKEN = 'admin-pgz-2026' -def is_admin(authorization): - if not authorization: return False +# Roles that get full PII visibility globally (PGŽ tier). +# Mirrors auth/auth_v2.py PGZ_USER_TYPES; kept local to avoid import cycle. +_PGZ_FULL_PII_ROLES = { + "super_admin", "pgz_admin", "pgz_user", "pgz_finance", "pgz_zzjz", + "admin", # legacy bearer-token role +} +_SAVEZ_PII_ROLES = {"savez_admin", "savez_user"} +_KLUB_PII_ROLES = {"klub_admin", "klub_user", "klub_trener", "klub_clan"} + + +def _decode_jwt_safe(authorization): + """Decode the bearer JWT using the same secret as auth_v2. + Returns the payload dict on success, None otherwise. 
Never raises.""" + if not authorization: + return None token = authorization.replace('Bearer ', '').strip() - if token == ADMIN_TOKEN: return True - # Try JWT + if not token or token == ADMIN_TOKEN: + return None try: - import jwt as _jwt - payload = _jwt.decode(token, JWT_SECRET, algorithms=["HS256"]) - return payload.get("role") == "admin" + from auth.auth_v2 import decode_token as _decode + return _decode(token) except Exception: - return False + return None + + +def auth_context(authorization): + """Returns (role, klub_id, savez_id, email) — never raises. + role is one of: super_admin / pgz_admin / savez_admin / klub_admin / + viewer / 'admin' (legacy token) / None (unauthenticated).""" + if not authorization: + return (None, None, None, None) + token = authorization.replace('Bearer ', '').strip() + if token == ADMIN_TOKEN: + return ('admin', None, None, 'legacy-bearer') + payload = _decode_jwt_safe(authorization) or {} + role = (payload.get("role") or "viewer").lower() + scope = payload.get("tenant_scope") or {} + return (role, scope.get("klub_id"), scope.get("savez_id"), payload.get("email")) + + +def is_admin(authorization): + """Backward-compatible boolean: True iff caller has unscoped full-PII access. + Now correctly recognizes super_admin / pgz_admin / pgz_user / pgz_finance / + pgz_zzjz JWT roles, not just literal 'admin'.""" + role, _kid, _sid, _e = auth_context(authorization) + return role in _PGZ_FULL_PII_ROLES + + +def can_see_full_pii(authorization, klub_id=None, savez_id=None): + """Scope-aware PII gate. + PGŽ-tier roles: full PII everywhere. + savez_admin/savez_user: full PII when row.savez_id == own savez_id. + klub_admin/klub_user/klub_trener/klub_clan: full PII when row.klub_id == own klub_id. 
+ Otherwise: masked.""" + role, kid, sid, _ = auth_context(authorization) + if role in _PGZ_FULL_PII_ROLES: + return True + if role in _SAVEZ_PII_ROLES and sid is not None and savez_id is not None and int(sid) == int(savez_id): + return True + if role in _KLUB_PII_ROLES and kid is not None and klub_id is not None and int(kid) == int(klub_id): + return True + return False + + +def _audit_oib_access(authorization, resource_type, resource_id, count=1, reason="legitimate_interest"): + """Log a full-OIB reveal to pgz_sport.audit_events (best-effort, never raises). + Used for GDPR Art.6(1)(f) defensibility. One row per request, not per OIB.""" + try: + role, _kid, _sid, email = auth_context(authorization) + if role is None: + return # only log authenticated reveals + from auth.auth_v2 import audit as _audit + # uid not directly available without re-decoding; pull from payload + payload = _decode_jwt_safe(authorization) or {} + uid = payload.get("uid") + _audit(uid, "oib.read", resource_type=resource_type, resource_id=resource_id, + meta={"role": role, "email": email, "count": count, "reason": reason}) + except Exception as _e: + # Audit must never break the request path + try: + print(f"[OIB_AUDIT WARN] {_e}") + except Exception: + pass def blur_oib(v): if not v: return v @@ -64,11 +136,27 @@ def blur_text(t, keep=3): if not t: return t s=str(t); return s[:keep]+'•'*(len(s)-keep*2)+s[-keep:] if len(s)>keep*2 else s -def apply_privacy(rows, admin): +def apply_privacy(rows, admin, authorization=None): + """Apply per-row privacy masking. + `admin`: legacy global override — when True, NOTHING is masked. + `authorization`: when provided, enables per-row scope-aware reveals + (savez_admin sees own savez rows in clear; klub_admin sees own klub + rows in clear). Falls back to row-level mask if scope mismatches. 
+ """ if admin: return rows + is_list = isinstance(rows, list) out = [] - for r in (rows if isinstance(rows, list) else [rows]): + for r in (rows if is_list else [rows]): rr = dict(r) + # Per-row scope check (only relevant when authorization is supplied) + row_full = False + if authorization is not None: + row_full = can_see_full_pii(authorization, + klub_id=rr.get("klub_id") or rr.get("id_klub"), + savez_id=rr.get("savez_id") or rr.get("id_savez")) + if row_full: + out.append(rr) + continue for k, v in list(rr.items()): if v is None: continue kl = k.lower() @@ -80,7 +168,7 @@ def apply_privacy(rows, admin): elif kl == 'adresa': rr[k] = blur_text(v, 3) elif 'licenca_broj' in kl: rr[k] = blur_text(v, 2) out.append(rr) - return out if isinstance(rows, list) else out[0] + return out if is_list else out[0] app = FastAPI(title="PGŽ Sportski savez ERP/CRM", version="1.0.0") app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"]) @@ -222,7 +310,16 @@ def health(): @app.get("/api/whoami") def whoami_v2(authorization: Optional[str] = Header(None)): - return {"role": "admin" if is_admin(authorization) else "viewer", "privacy_active": not is_admin(authorization)} + role, klub_id, savez_id, email = auth_context(authorization) + full_pii = is_admin(authorization) + return { + "role": role or "viewer", + # Legacy boolean retained for backward compat with old frontend code + "is_admin": full_pii, + "privacy_active": not full_pii, + "scope": {"klub_id": klub_id, "savez_id": savez_id}, + "email": email, + } # ==================== DASHBOARD ==================== @app.get("/api/dashboard") @@ -528,18 +625,28 @@ def list_savezi(authorization: Optional[str] = Header(None), q: Optional[str] = (SELECT trenera FROM pgz_sport.statistika_saveza WHERE savez_id=s.id AND godina=2024) AS treneri_2024, (SELECT reprezentativaca FROM pgz_sport.statistika_saveza WHERE savez_id=s.id AND godina=2024) AS repr_2024 FROM pgz_sport.savezi s {where} ORDER BY 
{sort_col}{collate} {order}""", params) - rows = apply_privacy(rows, is_admin(authorization)) + admin = is_admin(authorization) + rows = apply_privacy(rows, admin, authorization=authorization) + if admin: + _audit_oib_access(authorization, "savez_list", None, count=len(rows)) return {"count": len(rows), "rows": rows} @app.get("/api/savezi/{savez_id}") -def get_savez(savez_id: int): +def get_savez(savez_id: int, authorization: Optional[str] = Header(None)): rows = fetch("SELECT * FROM pgz_sport.savezi WHERE id=%s", [savez_id]) if not rows: raise HTTPException(404, "Savez ne postoji") klubovi = fetch("SELECT * FROM pgz_sport.klubovi WHERE savez_id=%s ORDER BY naziv", [savez_id]) statistika = fetch("SELECT * FROM pgz_sport.statistika_saveza WHERE savez_id=%s ORDER BY godina", [savez_id]) manifestacije = fetch("SELECT * FROM pgz_sport.manifestacije WHERE savez_id=%s", [savez_id]) - return {**rows[0], "klubovi": klubovi, "statistika": statistika, "manifestacije": manifestacije} + admin = is_admin(authorization) + savez = rows[0] + if not admin: + savez = apply_privacy(savez, admin, authorization=authorization) + klubovi = apply_privacy(klubovi, admin, authorization=authorization) + else: + _audit_oib_access(authorization, "savez", savez_id, count=1+len(klubovi)) + return {**savez, "klubovi": klubovi, "statistika": statistika, "manifestacije": manifestacije} # ==================== KLUBOVI ==================== @app.get("/api/klubovi") @@ -566,12 +673,15 @@ def list_klubovi(authorization: Optional[str] = Header(None), q: Optional[str] = order_sql = "DESC" if order.lower() == "desc" else "ASC" where_sql = " AND ".join(where) if where else "TRUE" collate = ' COLLATE "hr-HR-x-icu"' if sort_col in ("klub", "savez", "razina", "region", "grad", "sport") else "" - rows = fetch(f"""SELECT * FROM pgz_sport.v_klubovi_pregled WHERE {where_sql} + rows = fetch(f"""SELECT * FROM pgz_sport.v_klubovi_pregled WHERE {where_sql} ORDER BY {sort_col}{collate} {order_sql} NULLS LAST""", params) 
for r in rows: if isinstance(r, dict) and r.get('klub') and not r.get('naziv'): r['naziv'] = r['klub'] - rows = apply_privacy(rows, is_admin(authorization)) + admin = is_admin(authorization) + rows = apply_privacy(rows, admin, authorization=authorization) + if admin: + _audit_oib_access(authorization, "klub_list", None, count=len(rows)) return {"count": len(rows), "rows": rows} @app.get("/api/klubovi/{klub_id}") @@ -628,12 +738,19 @@ def get_klub(klub_id: int, authorization: Optional[str] = Header(None)): } klub = rows[0] - if not admin: - klub = apply_privacy(klub, admin) - clanovi = apply_privacy(clanovi, admin) - clanarine = apply_privacy(clanarine, admin) - lijecnicki = apply_privacy(lijecnicki, admin) - + # Scope-aware: klub_admin for THIS klub_id should see full PII even if + # is_admin() returns False (savez_admin similarly via klub.savez_id). + scope_full = can_see_full_pii(authorization, klub_id=klub_id, savez_id=klub.get("savez_id")) + if not admin and not scope_full: + klub = apply_privacy(klub, admin, authorization=authorization) + clanovi = apply_privacy(clanovi, admin, authorization=authorization) + clanarine = apply_privacy(clanarine, admin, authorization=authorization) + lijecnicki = apply_privacy(lijecnicki, admin, authorization=authorization) + else: + # Authenticated full-PII access — audit it. 
+ _audit_oib_access(authorization, "klub", klub_id, + count=1 + len(clanovi) + len(clanarine) + len(lijecnicki)) + return {**klub, "clanovi": clanovi, "clanarine": clanarine, "lijecnicki": lijecnicki, "potpore": potpore, "stats": stats} @@ -696,7 +813,10 @@ def list_clanovi(authorization: Optional[str] = Header(None), q: Optional[str] = (SELECT SUM(iznos_propisan-iznos_placen) FROM pgz_sport.clanarine WHERE clan_id=c.id AND status!='podmireno') AS dug_clanarine FROM pgz_sport.clanovi c LEFT JOIN pgz_sport.klubovi k ON k.id=c.klub_id WHERE {where_sql} ORDER BY {sort_col} {order}""", params) - rows = apply_privacy(rows, is_admin(authorization)) + admin = is_admin(authorization) + rows = apply_privacy(rows, admin, authorization=authorization) + if admin: + _audit_oib_access(authorization, "clan_list", None, count=len(rows)) return {"count": len(rows), "rows": rows} class ClanIn(BaseModel): diff --git a/static/admin.html b/static/admin.html index 4303970..c632fed 100644 --- a/static/admin.html +++ b/static/admin.html @@ -159,6 +159,7 @@ td.num { font-family: 'JetBrains Mono', monospace; text-align: right; } +${escapeHtml(t.slug)}| Datum | Izdavatelj | OIB | Vrsta | Iznos | Status |
|---|---|---|---|---|---|
| ${esc(r.datum)} | ${esc(r.izdavatelj)} | ${esc(r.oib)} | ${esc(r.vrsta)} | ${fmtEur(r.iznos)} | ${esc(r.status)} |