Files
pgz-sport/_audit/sub4_enrich.py
T
damir e07292ba44 logout() proper fix: revoke backend + clear ALL session keys
Old logout() was demo placeholder:
- only cleared 'app-role' + 'jwt' (NOT pgz_access/refresh/user)
- did NOT call POST /auth/logout to revoke JWT
- redirected to /static/sport2.html (wrong)

New logout() now:
1. POST /auth/logout to revoke JWT server-side
2. Clear ALL keys: pgz_access, pgz_refresh, pgz_user, app-role, jwt, access_token, refresh_token, pgz_session_id (both localStorage + sessionStorage)
3. Redirect to /login

Verified by Playwright E2E: token absent after logout.
2026-05-05 09:24:12 +02:00

527 lines
22 KiB
Python

#!/usr/bin/env python3
# sub4_enrich.py v1.0 - dradulic@outlook.com / damir@rinet.one - 2026-05-05
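# Description: Enrich pgz_sport.manifestacije with wiki_url candidates (the web column is read but not filled here).
# Probes Wikipedia HR/EN via GET (direct slug, then OpenSearch), verifies content match, scores confidence.
# Writes CSV/XLSX kandidata + SQL apply script; rows at/above the apply threshold are also written to the DB when the columns exist.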
import csv
import os
import re
import sys
import time
import unicodedata
import urllib.parse
import urllib.request
import urllib.error
import socket
import ssl
import json
from datetime import datetime, timezone
import psycopg2
import psycopg2.extras
# ---------- Config ----------
# Env file holding the PG_* connection settings.
ENV_PATH = "/opt/pgz-sport/.env"
# Sent as the User-Agent header on every outgoing request.
USER_AGENT = (
    "PGZ-sport-data-bot/1.0 "
    "(https://api.rinet.one/sport/; dradulic@outlook.com)"
)
# Per-request timeout passed to urlopen (seconds).
TIMEOUT = 8
# Pause after each Wikipedia request (seconds); kept above 1s.
RATE_SLEEP = 1.1
# Candidates scoring at or above this go to the APPLY list.
APPLY_THRESHOLD = 0.85
# Every output artifact lives in this directory.
AUDIT_DIR = "/opt/pgz-sport/_audit"
KANDIDATI_XLSX = AUDIT_DIR + "/sub4_manifestacije_kandidati.xlsx"
KANDIDATI_CSV = AUDIT_DIR + "/sub4_manifestacije_kandidati.csv"
APPLY_SQL = AUDIT_DIR + "/sub4_manifestacije_apply.sql"
LOG_FILE = AUDIT_DIR + "/sub4_manifestacije.log"
# ---------- ENV loader ----------
def load_env(path):
    """Parse a simple ``KEY=VALUE`` env file into a dict of strings.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Each remaining line is split on the first ``=`` only, so a value
    may itself contain ``=``. Keys and values are whitespace-trimmed.

    Exactly one pair of *matching* surrounding quotes (``'...'`` or
    ``"..."``) is removed from a value. Quote characters that are part of the
    value (e.g. a secret ending in ``'``) are left untouched.

    The file is always decoded as UTF-8, independent of the process locale.

    Raises OSError if the file cannot be opened.
    """
    env = {}
    with open(path, "r", encoding="utf-8") as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            value = value.strip()
            # Only unwrap a balanced pair; chained .strip("'").strip('"')
            # would also eat a lone quote that belongs to the value.
            if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
                value = value[1:-1]
            env[key.strip()] = value
    return env
# Parsed once at import time; db_connect() reads the PG_* connection settings from this dict.
ENV = load_env(ENV_PATH)
# ---------- Normalization ----------
def normalize_for_wiki(naziv: str) -> str:
    """Build the URL path segment used for a direct Wikipedia page lookup.

    Outer whitespace is trimmed, every inner whitespace run becomes a single
    underscore, and the result is percent-encoded with only ``_`` and ``-``
    (besides unreserved characters) left literal.
    """
    underscored = re.sub(r'\s+', '_', naziv.strip())
    return urllib.parse.quote(underscored, safe="_-")
def strip_diacritics(s: str) -> str:
    """Return ``s`` NFKD-decomposed with all combining marks removed.

    Letters that have no Unicode decomposition (e.g. ``đ``) pass through
    unchanged.
    """
    kept = []
    for ch in unicodedata.normalize('NFKD', s):
        if unicodedata.combining(ch):
            continue  # drop the accent, keep the base letter
        kept.append(ch)
    return ''.join(kept)
def naziv_substr(naziv: str) -> str:
    """Return a short, distinctive fragment of an event name for page checks.

    One leading generic word (Memorijal, Kup, Turnir, ...; case-insensitive)
    is dropped, unless fewer than 4 characters would remain, in which case
    the full trimmed name is used. The first two whitespace-separated words
    of the result are returned (or the whole thing if it is a single word).
    """
    full = naziv.strip()
    # Anchored at the start, so at most one generic prefix is removed.
    prefix = re.match(
        r'^(Memorijal(ni)?|Međunarodni|Hrvatski|Trofej|Kup|Turnir|Nagrada|Dani|Regata)\s+',
        full,
        re.IGNORECASE,
    )
    remainder = full[prefix.end():].strip() if prefix else full
    if len(remainder) < 4:
        remainder = full
    parts = remainder.split()
    if len(parts) < 2:
        return remainder
    return ' '.join(parts[:2])
# ---------- HTTP ----------
def http_request(url: str, method: str = "GET", max_bytes: int = None):
    """Perform one HTTP(S) request; request/response failures never raise.

    Args:
        url: Absolute URL to request.
        method: HTTP method; a body is only read for ``"GET"``.
        max_bytes: Upper bound on bytes read from the body. ``None`` (or 0)
            reads the whole body.

    Returns:
        ``(status_code, final_url, body)`` where
        - ``status_code`` is the response status, the error code for an
          ``HTTPError`` response, or ``0`` when no response was obtained
          (malformed URL, DNS/connection/TLS failure, timeout, ...);
        - ``final_url`` is the post-redirect URL on success, otherwise the
          ``url`` that was passed in;
        - ``body`` is ``bytes`` for a successful GET and ``None`` otherwise.
    """
    ctx = ssl.create_default_context()
    try:
        # Request() validates the URL and raises ValueError for e.g. a missing
        # scheme. Keeping it inside the try makes a bad URL (possibly returned
        # by a remote API) a status-0 result instead of aborting the run.
        req = urllib.request.Request(url, method=method)
        req.add_header("User-Agent", USER_AGENT)
        req.add_header("Accept-Language", "hr,en;q=0.8")
        with urllib.request.urlopen(req, timeout=TIMEOUT, context=ctx) as resp:
            body = None
            if method == "GET":
                body = resp.read(max_bytes) if max_bytes else resp.read()
            return (resp.status, resp.geturl(), body)
    except urllib.error.HTTPError as err:
        # HTTPError wraps the open response; release it explicitly.
        code = err.code
        err.close()
        return (code, url, None)
    except Exception:
        # Deliberate best-effort: URLError, timeouts, TLS and connection
        # errors, truncated reads etc. are all reported as "no response".
        return (0, url, None)
def head_probe(url: str):
    """Issue a HEAD request for ``url`` and return http_request()'s result tuple."""
    result = http_request(url, method="HEAD")
    return result
def get_snippet(url: str, max_kb: int = 50):
    """GET ``url`` with the body read capped at ``max_kb`` * 1024 bytes.

    Returns http_request()'s ``(status, final_url, body)`` tuple unchanged.
    """
    byte_limit = 1024 * max_kb
    return http_request(url, "GET", byte_limit)
# ---------- Verification ----------
def verify_content(url: str, naziv: str):
    """Fetch the first 50 KB of ``url`` and check how well it matches ``naziv``.

    All text comparisons are substring checks on lower-cased,
    diacritic-stripped text.

    Returns a 7-tuple
    ``(status, final_url, match_count, has_disambig, sport_match,
    distinctive_match, pn_missing)``:

    - ``status``, ``final_url``: as returned by get_snippet().
    - ``match_count``: the larger of (a) how many tokens (len >= 3) of
      naziv_substr(naziv) and (b) how many tokens (len >= 4) of the full
      naziv occur in the page.
    - ``has_disambig``: the page carries disambiguation-page markers.
    - ``sport_match``: at least one sport-related keyword occurs.
    - ``distinctive_match``: no capitalized, non-generic word of naziv is
      missing from the page (True when naziv has no such word).
    - ``pn_missing``: the capitalized words that were not found.

    When the fetch fails (status outside 200-399 or empty body) the result is
    ``(status, final_url, 0, False, False, True, [])``.
    """
    status, final_url, body = get_snippet(url, max_kb=50)
    if status < 200 or status >= 400 or not body:
        return (status, final_url, 0, False, False, True, [])
    # errors="ignore" cannot raise; it also drops a multi-byte sequence that
    # was cut in half by the 50 KB read limit.
    text = body.decode("utf-8", errors="ignore")
    text_low = strip_diacritics(text).lower()

    substr = strip_diacritics(naziv_substr(naziv)).lower()
    tokens = [t for t in re.split(r'\s+', substr) if len(t) >= 3]
    match_count = sum(1 for t in tokens if t in text_low)
    # also check if full naziv (or key words) appears
    full_low = strip_diacritics(naziv).lower()
    full_tokens = [t for t in re.split(r'\s+', full_low) if len(t) >= 4]
    full_matches = sum(1 for t in full_tokens if t in text_low)

    # Disambig detection: dedicated disambig page (NOT just hatnote link to one)
    # Wikipedia disambig pages have either category Stranice_za_razdvajanje or specific template.
    has_disambig = (
        'wgPageContentModel":"wikitext"' in text and
        ('Kategorija:Stranice_za_razdvajanje' in text
         or 'Category:Disambiguation_pages' in text
         or 'wgVisualEditorPageIsDisambiguation":true' in text)
    )

    # Sport-context check: any sport keyword (word-boundary) must appear.
    # Use regex \b to avoid matching 'ski' inside 'wikipedia', etc.
    sport_keywords = [
        r'\bsport', r'\bregat', r'\brally\b', r'\breli\b', r'\bturnir',
        r'\bmemorijal', r'\bkup\b', r'\bautomobiliz', r'\bjedrili',
        r'\bjedren', r'\bauto[- ]?cross', r'\bkosark', r'\brukomet',
        r'\bodbojk', r'\bplivac', r'\bplivanj', r'\bsahovsk', r'\bsahovi',
        r'\bsah\b', r'\bbiciklizm', r'\batleti', r'\bstreljas',
        r'\btaekwondo', r'\bkarate', r'\btenisk', r'\btenis\b', r'\bjudo\b',
        r'\bboce\b', r'\bbocanj', r'\bnogomet', r'\bsailing', r'\btournament',
        r'\bfootball', r'\bbasketball', r'\bvolleyball', r'\bhandball',
        r'\bswimming', r'\bathletics\b', r'\bfencing\b', r'\barchery',
        r'\bshooting', r'\bfishing\b', r'\bribolov', r'\bmaraton',
        r'\bcross-country', r'\bspeedminton', r'\bbadminton',
        r'\bsnowboard', r'\bskijanj', r'\bskijas', r'\bvaterpolo',
        r'\bwater polo', r'\bcompetition\b', r'\bnatjecanj',
    ]
    sport_match = any(re.search(p, text_low) for p in sport_keywords)

    # Distinctive-word check: every Capitalized "proper noun" word in naziv (len>=4)
    # should appear in the page. Missing one strongly suggests wrong-topic match.
    generic_words = {
        'kup', 'memorijal', 'memorijalni', 'međunarodni', 'medunarodni', 'hrvatski',
        'turnir', 'nagrada', 'dani', 'regata', 'trofej', 'open', 'cup', 'rally', 'reli',
        'masters', 'prvenstvo', 'rijeke', 'pgz', 'pgž', 'grada', 'grad',
    }
    edge_punct = "\"'.,;:()-\u201e\u201c\u201d\u00ab\u00bb"
    proper_nouns = []
    for raw in naziv.split():
        # Strip punctuation BEFORE the length/capital/generic tests, otherwise
        # a quoted name ('"Galeb"') is skipped because its first character is
        # the quote, and 'Rijeke,' slips past the generic-word set.
        word = raw.strip(edge_punct)
        if len(word) >= 4 and word[0].isupper() and word.lower() not in generic_words:
            proper_nouns.append(word)

    pn_missing = []
    for pn in proper_nouns:
        pn_n = strip_diacritics(pn).lower()
        if pn_n and pn_n not in text_low:
            pn_missing.append(pn)
    distinctive_match = (len(pn_missing) == 0) if proper_nouns else True

    return (status, final_url, max(match_count, full_matches), has_disambig,
            sport_match, distinctive_match, pn_missing)
# ---------- Wikipedia probing ----------
def try_wikipedia(naziv: str, lang: str = "hr"):
    """Probe ``https://<lang>.wikipedia.org/wiki/<slug>`` for an event name.

    The slug comes from normalize_for_wiki() and the page is checked with
    verify_content(). Returns a dict with keys: lang, url, status, final_url,
    matches, has_disambig, sport_match, distinctive_match, pn_missing.
    """
    page_url = f"https://{lang}.wikipedia.org/wiki/{normalize_for_wiki(naziv)}"
    (http_status, landed_url, hits, is_disambig,
     has_sport, has_distinctive, missing) = verify_content(page_url, naziv)
    return dict(
        lang=lang,
        url=page_url,
        status=http_status,
        final_url=landed_url,
        matches=hits,
        has_disambig=is_disambig,
        sport_match=has_sport,
        distinctive_match=has_distinctive,
        pn_missing=missing,
    )
def try_wikipedia_search(naziv: str, lang: str = "hr"):
    """Ask the Wikipedia OpenSearch API for the top hit for ``naziv``.

    Returns ``{"title": ..., "url": ...}`` for the first result, or ``None``
    when the request fails, the payload is not the expected 4-element list,
    or there are no result URLs.
    """
    endpoint = f"https://{lang}.wikipedia.org/w/api.php?action=opensearch&limit=3&format=json&search="
    status, _, body = http_request(
        endpoint + urllib.parse.quote(naziv), method="GET", max_bytes=8192
    )
    if status != 200 or not body:
        return None
    try:
        payload = json.loads(body.decode("utf-8", errors="ignore"))
        # OpenSearch returns [query, [titles], [descs], [urls]]
        if not isinstance(payload, list) or len(payload) < 4:
            return None
        found_titles, found_urls = payload[1], payload[3]
        if not found_urls:
            return None
        top_title = found_titles[0] if found_titles else None
        return {"title": top_title, "url": found_urls[0]}
    except Exception:
        return None
# ---------- Confidence scoring ----------
def score_confidence(probe: dict, naziv: str) -> float:
    """Turn a Wikipedia probe result into a confidence in [0, 1], rounded to 2 places.

    ``None`` probes and statuses outside 200-399 score 0.0; disambiguation
    pages score a flat 0.4. Otherwise a base score is chosen by language and
    match count, then penalties are subtracted one after another (each
    clamped at 0): short name, no sport keyword, missing distinctive word.
    """
    if probe is None:
        return 0.0
    http_status = probe.get("status", 0)
    if not (200 <= http_status < 400):
        return 0.0
    if probe.get("has_disambig", False):
        return 0.4

    # Base score per language: (matches >= 2, matches == 1, no match).
    tiers = {
        "hr": (0.95, 0.80, 0.50),
        "en": (0.85, 0.70, 0.45),
    }
    strong, weak, none_found = tiers.get(probe.get("lang", ""), (0.70, 0.70, 0.40))
    hits = probe.get("matches", 0)
    if hits >= 2:
        score = strong
    elif hits >= 1:
        score = weak
    else:
        score = none_found

    penalties = (
        # very short naziv is more ambiguous
        (len(naziv) < 8, 0.10),
        # no sport-related keyword on the page (likely wrong topic)
        (not probe.get("sport_match", False), 0.40),
        # distinctive proper noun (e.g. specific city name) missing
        (not probe.get("distinctive_match", True), 0.50),
    )
    for applies, amount in penalties:
        if applies:
            score = max(0.0, score - amount)
    return round(score, 2)
# ---------- DB ----------
def db_connect():
    """Open a new psycopg2 connection using the PG_* settings from ENV.

    Raises KeyError if a required PG_* key is absent from ENV.
    """
    params = {
        "host": ENV["PG_HOST"],
        "port": int(ENV["PG_PORT"]),
        "user": ENV["PG_USER"],
        "password": ENV["PG_PASS"],
        "dbname": ENV["PG_DB"],
    }
    return psycopg2.connect(**params)
def fetch_manifestacije():
    """Load the events that still need enrichment.

    Returns ``(rows, has_cols)``. When the ``web``/``wiki_url`` columns exist,
    ``rows`` holds only events where at least one of them is empty and
    ``has_cols`` is True. If the columns are missing, all events are returned
    (id, naziv, mjesto, organizator only) and ``has_cols`` is False. Rows are
    plain dicts ordered by id.
    """
    pending_query = (
        "\n"
        "SELECT id, naziv, mjesto, organizator, web, wiki_url\n"
        "FROM pgz_sport.manifestacije\n"
        "WHERE COALESCE(web,'') = '' OR COALESCE(wiki_url,'') = ''\n"
        "ORDER BY id\n"
    )
    fallback_query = (
        "\n"
        "SELECT id, naziv, mjesto, organizator\n"
        "FROM pgz_sport.manifestacije\n"
        "ORDER BY id\n"
    )
    conn = db_connect()
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # Try to read web/wiki_url; if columns missing, fallback to id+naziv only
            try:
                cur.execute(pending_query)
                return list(map(dict, cur.fetchall())), True
            except psycopg2.errors.UndefinedColumn:
                # Clear the failed transaction before issuing the fallback query.
                conn.rollback()
                cur.execute(fallback_query)
                return list(map(dict, cur.fetchall())), False
    finally:
        conn.close()
def fetch_summary():
    """Count events overall and those with non-empty ``web`` / ``wiki_url``.

    Returns a dict with keys ``total``, ``ima_web``, ``ima_wiki`` and
    ``has_cols``. If the two columns do not exist, both counts are 0 and
    ``has_cols`` is False.
    """
    filled_query = (
        "\n"
        "SELECT COUNT(web) FILTER (WHERE COALESCE(web,'')<>''),\n"
        "COUNT(wiki_url) FILTER (WHERE COALESCE(wiki_url,'')<>'')\n"
        "FROM pgz_sport.manifestacije\n"
    )
    conn = db_connect()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT COUNT(*) FROM pgz_sport.manifestacije")
            summary = {"total": cur.fetchone()[0]}
            try:
                cur.execute(filled_query)
                summary["ima_web"], summary["ima_wiki"] = cur.fetchone()
                summary["has_cols"] = True
            except psycopg2.errors.UndefinedColumn:
                # Clear the failed transaction; report zero filled values.
                conn.rollback()
                summary.update(ima_web=0, ima_wiki=0, has_cols=False)
            return summary
    finally:
        conn.close()
# ---------- Main loop ----------
def _pick_better(best, cand):
    """Return ``cand`` if it beats ``best`` on confidence, else ``best`` (ties keep ``best``)."""
    if best is None or cand["confidence"] > best["confidence"]:
        return cand
    return best


def _below_threshold(best):
    """Return True while no candidate has reached APPLY_THRESHOLD yet."""
    return not best or best["confidence"] < APPLY_THRESHOLD


def _find_best_candidate(naziv, log, stats):
    """Probe Wikipedia for one event name; return the best candidate dict or None.

    Order: HR direct slug, EN direct slug, HR OpenSearch, EN OpenSearch. Every
    step after the first runs only while the best confidence is still below
    APPLY_THRESHOLD. ``stats`` success counters are incremented in place and
    each probe is reported through ``log``. Sleeps RATE_SLEEP after every
    outgoing request.
    """
    best = None  # dict with url, lang, confidence, razlog

    # 1. HR Wikipedia direct slug
    probe_hr = try_wikipedia(naziv, "hr")
    time.sleep(RATE_SLEEP)
    conf_hr = score_confidence(probe_hr, naziv)
    log(
        f" WIKI-HR slug status={probe_hr['status']} matches={probe_hr['matches']}"
        f" disambig={probe_hr['has_disambig']} sport={probe_hr.get('sport_match')}"
        f" dist={probe_hr.get('distinctive_match')} miss={probe_hr.get('pn_missing')}"
        f" conf={conf_hr}"
    )
    if conf_hr > 0:
        stats["succ_wiki_hr"] += 1
        best = _pick_better(best, {
            "url": probe_hr["final_url"] or probe_hr["url"],
            "lang": "hr",
            "confidence": conf_hr,
            "razlog": f"Wikipedia HR direct slug, matches={probe_hr['matches']}",
        })

    # 2. EN Wikipedia direct slug (only if HR not high-confidence)
    if _below_threshold(best):
        probe_en = try_wikipedia(naziv, "en")
        time.sleep(RATE_SLEEP)
        conf_en = score_confidence(probe_en, naziv)
        log(
            f" WIKI-EN slug status={probe_en['status']} matches={probe_en['matches']}"
            f" disambig={probe_en['has_disambig']} conf={conf_en}"
        )
        if conf_en > 0:
            stats["succ_wiki_en"] += 1
            best = _pick_better(best, {
                "url": probe_en["final_url"] or probe_en["url"],
                "lang": "en",
                "confidence": conf_en,
                "razlog": f"Wikipedia EN direct slug, matches={probe_en['matches']}",
            })

    # 3./4. OpenSearch fallback, HR first, then EN
    for lang in ("hr", "en"):
        if not _below_threshold(best):
            break
        label = lang.upper()
        sr = try_wikipedia_search(naziv, lang)
        time.sleep(RATE_SLEEP)
        if not (sr and sr.get("url")):
            continue
        status, final_url, matches, has_dis, sport_match, dist_m, pn_m = verify_content(sr["url"], naziv)
        time.sleep(RATE_SLEEP)
        probe = {
            "lang": lang,
            "url": sr["url"],
            "status": status,
            "final_url": final_url,
            "matches": matches,
            "has_disambig": has_dis,
            "sport_match": sport_match,
            "distinctive_match": dist_m,
            "pn_missing": pn_m,
        }
        # search results are a step less reliable than direct slug match
        conf = round(max(0.0, score_confidence(probe, naziv) - 0.05), 2)
        log(f" WIKI-{label} search title={sr.get('title')!r} status={status} matches={matches} conf={conf}")
        if conf > 0:
            stats[f"succ_search_{lang}"] += 1
            best = _pick_better(best, {
                "url": final_url or sr["url"],
                "lang": f"{lang}-search",
                "confidence": conf,
                "razlog": f"Wikipedia {label} opensearch '{sr.get('title')}', matches={matches}",
            })
    return best


def _output_table(apply_rows, candidate_rows):
    """Return header + data rows shared by the CSV and XLSX outputs (APPLY rows first)."""
    table = [["id", "naziv", "predlozeni_url", "lang", "confidence", "razlog", "kategorija"]]
    for kategorija, recs in (("APPLY", apply_rows), ("KANDIDAT", candidate_rows)):
        for r in recs:
            table.append([r["id"], r["naziv"], r["predlozeni_url"], r["lang"],
                          r["confidence"], r["razlog"], kategorija])
    return table


def _write_csv(apply_rows, candidate_rows):
    """Write all candidates to KANDIDATI_CSV (UTF-8)."""
    with open(KANDIDATI_CSV, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows(_output_table(apply_rows, candidate_rows))


def _write_xlsx(apply_rows, candidate_rows, log):
    """Write all candidates to KANDIDATI_XLSX; on any failure (e.g. openpyxl missing) log and skip."""
    try:
        from openpyxl import Workbook
        wb = Workbook()
        ws = wb.active
        ws.title = "manifestacije_kandidati"
        for line in _output_table(apply_rows, candidate_rows):
            ws.append(line)
        wb.save(KANDIDATI_XLSX)
        log(f"Wrote XLSX: {KANDIDATI_XLSX}")
    except Exception as e:
        log(f"XLSX skipped: {e}")


def _write_apply_sql(apply_rows):
    """Write APPLY_SQL: idempotent schema additions plus one guarded UPDATE per apply row."""
    with open(APPLY_SQL, "w", encoding="utf-8") as f:
        f.write("-- sub4_manifestacije_apply.sql v1.0 - 2026-05-05\n")
        f.write("-- Run as: psql -h $PG_HOST -p $PG_PORT -U $PG_USER -d $PG_DB -f sub4_manifestacije_apply.sql\n")
        f.write(f"-- Confidence threshold: >= {APPLY_THRESHOLD} (Wikipedia HR/EN with content verification)\n\n")
        f.write("BEGIN;\n\n")
        f.write("-- Schema additions (idempotent)\n")
        f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS web TEXT;\n")
        f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS wiki_url TEXT;\n")
        f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_at TIMESTAMPTZ;\n")
        f.write("ALTER TABLE pgz_sport.manifestacije ADD COLUMN IF NOT EXISTS enriched_confidence REAL;\n\n")
        for r in apply_rows:
            url = r["predlozeni_url"].replace("'", "''")
            # razlog can embed a title returned by a remote API. Collapse all
            # whitespace so a line break cannot terminate the "--" comment and
            # smuggle a statement into the script.
            comment = " ".join(str(r["razlog"]).split())
            f.write(f"-- id={r['id']} {comment}\n")
            f.write(
                f"UPDATE pgz_sport.manifestacije "
                f"SET wiki_url='{url}', enriched_at=NOW(), enriched_confidence={r['confidence']} "
                f"WHERE id={r['id']} AND COALESCE(wiki_url,'')='';\n"
            )
        f.write("\nCOMMIT;\n")


def _apply_to_db(apply_rows, log):
    """Apply the high-confidence rows to the DB in one transaction.

    Only rows whose wiki_url is still empty are updated. Any failure is
    logged, not raised, and leaves the transaction uncommitted.
    """
    conn = None
    try:
        conn = db_connect()
        with conn.cursor() as cur:
            applied_db = 0
            for r in apply_rows:
                cur.execute(
                    "UPDATE pgz_sport.manifestacije "
                    "SET wiki_url=%s, enriched_at=NOW(), enriched_confidence=%s "
                    "WHERE id=%s AND COALESCE(wiki_url,'')=''",
                    (r["predlozeni_url"], r["confidence"], r["id"]),
                )
                applied_db += cur.rowcount
        conn.commit()
        log(f"DB apply: updated {applied_db} rows in pgz_sport.manifestacije")
    except Exception as e:
        log(f"DB apply failed: {e}")
    finally:
        # Always release the connection, including after a failed UPDATE.
        if conn is not None:
            conn.close()


def main():
    """Run the full enrichment pass and return a summary dict.

    Steps: snapshot DB counts, probe Wikipedia for every pending event, split
    results into apply rows (confidence >= APPLY_THRESHOLD) and candidates,
    write CSV / XLSX / SQL outputs, apply high-confidence rows to the DB when
    the columns exist, snapshot counts again and dump everything to a stats
    JSON file in AUDIT_DIR.

    Returns the dict written to the stats JSON (keys: before, after, stats,
    apply_rows, candidate_rows, ts).
    """
    os.makedirs(AUDIT_DIR, exist_ok=True)
    # Context manager: the log is closed even when a probe or DB call raises.
    # Explicit UTF-8: log lines carry event names with non-ASCII letters.
    with open(LOG_FILE, "w", encoding="utf-8") as logf:

        def log(msg):
            line = f"[{datetime.now(timezone.utc).isoformat()}] {msg}"
            print(line)
            logf.write(line + "\n")
            logf.flush()

        summary_before = fetch_summary()
        log(f"BEFORE: total={summary_before['total']} ima_web={summary_before['ima_web']} ima_wiki={summary_before['ima_wiki']} has_cols={summary_before['has_cols']}")
        rows, has_cols = fetch_manifestacije()
        log(f"Fetched {len(rows)} rows for enrichment")
        # Process all rows. Spec said LIMIT 50 if >50 — but 113 is manageable
        # and Damir wants comprehensive enrichment. Total runtime ~25 min worst case.
        log(f"Processing all {len(rows)} rows (spec said limit 50, but full coverage requested)")

        stats = {
            "probano": 0,
            "succ_wiki_hr": 0,
            "succ_wiki_en": 0,
            "succ_search_hr": 0,
            "succ_search_en": 0,
            "applied": 0,
            "kandidati": 0,
            "zero_match": 0,
        }
        apply_rows = []      # confidence >= APPLY_THRESHOLD
        candidate_rows = []  # 0 < confidence < APPLY_THRESHOLD

        for i, row in enumerate(rows, 1):
            rid = row["id"]
            naziv = row["naziv"]
            log(f"--- [{i}/{len(rows)}] id={rid} naziv={naziv!r}")
            stats["probano"] += 1

            best = _find_best_candidate(naziv, log, stats)
            if best is None:
                stats["zero_match"] += 1
                log(" -> NO match")
                continue

            log(f" -> BEST url={best['url']} lang={best['lang']} conf={best['confidence']}")
            rec = {
                "id": rid,
                "naziv": naziv,
                "predlozeni_url": best["url"],
                "lang": best["lang"],
                "confidence": best["confidence"],
                "razlog": best["razlog"],
            }
            if best["confidence"] >= APPLY_THRESHOLD:
                stats["applied"] += 1
                apply_rows.append(rec)
            else:
                stats["kandidati"] += 1
                candidate_rows.append(rec)

        log(f"STATS: {stats}")

        # ---------- Write outputs ----------
        # CSV (always)
        _write_csv(apply_rows, candidate_rows)
        log(f"Wrote CSV: {KANDIDATI_CSV} (apply={len(apply_rows)} kandidati={len(candidate_rows)})")
        # XLSX (best effort)
        _write_xlsx(apply_rows, candidate_rows, log)
        # SQL apply script (user can run after ALTER TABLE)
        _write_apply_sql(apply_rows)
        log(f"Wrote SQL apply script: {APPLY_SQL} (rows: {len(apply_rows)})")

        # Try direct DB apply (will succeed only if columns exist)
        if has_cols and apply_rows:
            _apply_to_db(apply_rows, log)
        else:
            log(f"DB apply skipped: has_cols={has_cols} apply_count={len(apply_rows)} (use SQL script)")

        summary_after = fetch_summary()
        log(f"AFTER: total={summary_after['total']} ima_web={summary_after['ima_web']} ima_wiki={summary_after['ima_wiki']} has_cols={summary_after['has_cols']}")

        # Stats JSON for MD generator
        out = {
            "before": summary_before,
            "after": summary_after,
            "stats": stats,
            "apply_rows": apply_rows,
            "candidate_rows": candidate_rows,
            "ts": datetime.now(timezone.utc).isoformat(),
        }
        with open(f"{AUDIT_DIR}/sub4_manifestacije_stats.json", "w", encoding="utf-8") as f:
            json.dump(out, f, ensure_ascii=False, indent=2)
        log("Wrote stats JSON")
        return out
# Run the enrichment pass only when this file is executed as a script.
if __name__ == "__main__":
    main()