feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
Executable
+194
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env python3
"""results_multilevel.py — daily scrape of league match results across HR sport
federations (HNS / HKS / HRS).

This file is the **scaffold**: a real production-grade per-match scraper for
each federation is per-federation HTML/embed/API work. The structure here
makes adding each federation a single function — and lays down logging,
dedup-by-external-id, klub-name → klub_id resolution, and CLI flags.

Scope today (2026-05-09):
  - HNS (nogomet): TODO — likely source https://hns-cff.hr/ + per-liga pages
  - HKS (košarka): partial — could borrow Genius Sports embed used by
    /opt/pgz-sport/scrapers/hks_scraper.py for standings; that scraper does
    *standings*, not per-match results
  - HRS (rukomet): TODO — likely source https://www.hrs.hr/

Exit 0 on partial coverage; per-federation failure logs to stderr.
"""
# NOTE: a `from __future__` import must be the first statement after the module
# docstring. The dotenv bootstrap used to be injected above it, which made the
# whole file fail to compile ("from __future__ imports must occur at the
# beginning of the file"). Keep the ordering below when patching this header.
from __future__ import annotations

import argparse
import json
import logging
import os
import sys
import time
from datetime import datetime, timezone, date
from typing import Optional

import psycopg2
import psycopg2.extras
from dotenv import load_dotenv

# auto-added by patch_scrapers_with_dotenv.sh
# Must run before any os.environ lookup made further down in this module, so
# that values defined in the env file are visible to those lookups.
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
|
||||
# Process-wide logging setup; every message from this script is tagged [results_ml].
logging.basicConfig(level=logging.INFO, format="%(asctime)s [results_ml] %(message)s")
log = logging.getLogger("results_ml")


def build_dsn(env=None) -> str:
    """Return the libpq connection string for the results database.

    Args:
        env: mapping to read settings from; defaults to ``os.environ``.

    Returns:
        The value of ``RINET_DSN`` verbatim when that key is present.
        Otherwise the built-in local DSN, with ``password=<DB_PASSWORD>``
        appended when ``DB_PASSWORD`` is present.

    The fallback is only computed when ``RINET_DSN`` is absent. Previously it
    was passed as the (eagerly evaluated) default of ``os.environ.get`` and so
    raised ``KeyError`` on a missing ``DB_PASSWORD`` even when ``RINET_DSN``
    was configured.
    """
    if env is None:
        env = os.environ
    if "RINET_DSN" in env:
        return env["RINET_DSN"]
    dsn = "host=127.0.0.1 port=6432 dbname=rinet_v3 user=rinet"
    password = env.get("DB_PASSWORD")
    if password is not None:
        dsn += f" password={password}"
    # NOTE(review): with no DB_PASSWORD the password keyword is simply omitted
    # and authentication is left to the connection attempt — confirm this is
    # preferable to failing at import time.
    return dsn


DSN = build_dsn()

# User-Agent string intended for outgoing scraper HTTP requests.
UA = "RiNET-Civic/1.0 (https://rinet.one) results scraper"

RATE_LIMIT_SEC = 1.2  # polite between requests within one federation
|
||||
|
||||
|
||||
# ─── DB helpers ────────────────────────────────────────────────────────
|
||||
def db_conn():
    """Open a new psycopg2 connection to the database described by ``DSN``.

    Returns:
        The connection object produced by ``psycopg2.connect``; the caller is
        responsible for committing/rolling back and closing it.
    """
    connection = psycopg2.connect(DSN)
    return connection
|
||||
|
||||
|
||||
# Insert-or-update statement for one match row, keyed on (federation, external_id).
# `xmax = 0` is reported back under the alias "inserted" to tell the two cases apart.
_UPSERT_MATCH_SQL = """
    INSERT INTO pgz_sport.results
        (sport, federation, liga, sezona, match_date, kolo,
         home_team, away_team, home_klub_id, away_klub_id,
         home_score, away_score, status, source_url, external_id, raw_payload)
    VALUES (%(sport)s, %(federation)s, %(liga)s, %(sezona)s,
            %(match_date)s, %(kolo)s,
            %(home_team)s, %(away_team)s, %(home_klub_id)s, %(away_klub_id)s,
            %(home_score)s, %(away_score)s,
            COALESCE(%(status)s,'final'),
            %(source_url)s, %(external_id)s,
            %(raw_payload)s::jsonb)
    ON CONFLICT (federation, external_id) DO UPDATE SET
        home_score = EXCLUDED.home_score,
        away_score = EXCLUDED.away_score,
        status     = EXCLUDED.status,
        raw_payload= EXCLUDED.raw_payload,
        scraped_at = now()
    RETURNING (xmax = 0) AS inserted
"""

_UPSERT_SAVEPOINT = "upsert_match"


def upsert_match(cur, row: dict) -> str:
    """Insert if (federation, external_id) is new; else update score+status.

    Args:
        cur: database cursor whose ``fetchone()`` returns a mapping (the
            result is read as ``["inserted"]``).
        row: match record; must supply every named parameter used by the
            statement. ``raw_payload`` is serialised to JSON here and defaults
            to ``{}`` when missing or empty.

    Returns:
        'inserted' | 'updated' | 'skipped' | 'error'. 'skipped' means the row
        has no ``federation`` or no ``external_id``; 'error' means the upsert
        failed and was logged — this function does not raise for that.

    Each upsert runs inside its own SAVEPOINT when the connection is in
    transaction mode. Without it, a single failed statement would leave the
    surrounding transaction aborted: every later statement would fail too and
    the final COMMIT would discard the rows that had succeeded.
    """
    if not row.get("federation") or not row.get("external_id"):
        return "skipped"

    # SAVEPOINT is only valid inside a transaction block, so skip it for
    # autocommit connections (cursors without a `.connection` are assumed
    # to be transactional).
    conn = getattr(cur, "connection", None)
    use_savepoint = not getattr(conn, "autocommit", False)
    savepoint_open = False

    try:
        params = {
            **row,
            "raw_payload": json.dumps(row.get("raw_payload") or {}, ensure_ascii=False),
        }
        if use_savepoint:
            cur.execute(f"SAVEPOINT {_UPSERT_SAVEPOINT}")
            savepoint_open = True
        cur.execute(_UPSERT_MATCH_SQL, params)
        was_inserted = cur.fetchone()["inserted"]
        if savepoint_open:
            cur.execute(f"RELEASE SAVEPOINT {_UPSERT_SAVEPOINT}")
        return "inserted" if was_inserted else "updated"
    except Exception as e:
        log.warning(f"upsert_match err for {row.get('federation')}/{row.get('external_id')}: {e}")
        if savepoint_open:
            # Undo only this row's statement so the outer transaction stays usable,
            # then drop the savepoint so repeated failures do not pile them up.
            try:
                cur.execute(f"ROLLBACK TO SAVEPOINT {_UPSERT_SAVEPOINT}")
                cur.execute(f"RELEASE SAVEPOINT {_UPSERT_SAVEPOINT}")
            except Exception as rollback_err:
                log.warning(f"upsert_match savepoint rollback failed: {rollback_err}")
        return "error"
|
||||
|
||||
|
||||
def resolve_klub(cur, name: str, sport: str) -> Optional[int]:
    """Best-effort name → klubovi.id. Falls back to None.

    Looks for a club of the given sport whose ``naziv`` contains ``name``
    (case-insensitive substring match on the first 40 characters of the
    stripped name) and returns the id of the shortest matching ``naziv``.

    Args:
        cur: database cursor whose ``fetchone()`` returns a mapping with "id".
        name: team name as scraped; ``None``, empty or whitespace-only names
            resolve to ``None`` without touching the database.
        sport: sport label compared against ``klubovi.sport`` with ILIKE.

    Returns:
        The matching club id, or ``None`` when nothing matches.
    """
    # Strip BEFORE the emptiness check: a whitespace-only name would otherwise
    # become the pattern "%%", which matches every club of that sport.
    needle = (name or "").strip()[:40]
    if not needle:
        return None

    # Escape LIKE metacharacters so "%" / "_" in a team name are matched
    # literally (backslash is PostgreSQL's default LIKE escape character).
    escaped = (
        needle.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )

    cur.execute(
        # `id ASC` makes the pick deterministic when several names tie on length.
        """SELECT id FROM pgz_sport.klubovi
            WHERE sport ILIKE %s AND naziv ILIKE %s
            ORDER BY length(naziv) ASC, id ASC LIMIT 1""",
        (sport, f"%{escaped}%"),
    )
    hit = cur.fetchone()
    return hit["id"] if hit else None
|
||||
|
||||
|
||||
# ─── Per-federation scrapers ────────────────────────────────────────────
|
||||
def scrape_hns(cur, sezona: str = "2025/26") -> dict:
    """Stub for the HNS (nogomet / football) results scraper.

    Not implemented yet: ``cur`` and ``sezona`` are accepted for interface
    parity with the other federation scrapers but are not used. A warning is
    logged and an all-zero result with ``status="stub"`` is returned.

    Author's notes on where the real implementation should look
    (described as verified in earlier ingests):
      * https://hns-cff.hr/natjecanja/  (lige listing)
      * https://semafor.hns.family/     (per-match drill-down — this is what
        hns_master_harvester.py uses for player seasons)

    Season-by-season per-match parsing is non-trivial because each liga page
    has a different layout, so the implementation is deferred.
    """
    outcome = dict(inserted=0, updated=0, errors=0, status="stub")
    log.warning("scrape_hns: NOT YET IMPLEMENTED — stub returns 0 rows")
    return outcome
|
||||
|
||||
|
||||
def scrape_hks(cur, sezona: str = "2025/26") -> dict:
    """Stub for the HKS (košarka / basketball) results scraper.

    Not implemented yet: ``cur`` and ``sezona`` are accepted for interface
    parity but are not used. A warning is logged and an all-zero result with
    ``status="stub"`` is returned.

    Author's notes: the Genius Sports embed exposes JSON for fixtures. The
    existing /opt/pgz-sport/scrapers/hks_scraper.py reads the *standings*
    embed at
      https://hosted.dcd.shared.geniussports.com/embednf/HKS/en/competition/{ID}/standings
    while the fixtures/results embed lives under a different path
    (.../competition/{ID}/fixtures or .../matches). Per-match parsing is
    deferred.
    """
    outcome = dict(inserted=0, updated=0, errors=0, status="stub")
    log.warning("scrape_hks: NOT YET IMPLEMENTED — stub returns 0 rows")
    return outcome
|
||||
|
||||
|
||||
def scrape_hrs(cur, sezona: str = "2025/26") -> dict:
    """Stub for the HRS (rukomet / handball) results scraper.

    Not implemented yet and no existing scraper to borrow from — the source
    is still to be determined. ``cur`` and ``sezona`` are accepted for
    interface parity but are not used. A warning is logged and an all-zero
    result with ``status="stub"`` is returned.

    Likely candidates noted by the author:
      * https://www.hrs.hr/  (federation site)
      * Per-liga pages once located
    """
    outcome = dict(inserted=0, updated=0, errors=0, status="stub")
    log.warning("scrape_hrs: NOT YET IMPLEMENTED — stub returns 0 rows")
    return outcome
|
||||
|
||||
|
||||
# ─── orchestration ─────────────────────────────────────────────────────
|
||||
# Registry of runnable federations: CLI key -> (sport label, scraper function).
# Each scraper is called as fn(cur, sezona=...) and returns a result dict.
FEDS = dict(
    hns=("nogomet", scrape_hns),
    hks=("kosarka", scrape_hks),
    hrs=("rukomet", scrape_hrs),
)
|
||||
|
||||
|
||||
def main(argv: Optional[list] = None) -> int:
    """Run the selected federation scrapers and print a JSON summary.

    Args:
        argv: command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so ``main()`` behaves as before.

    CLI flags:
        --feds     comma list of federation keys (default "hns,hks,hrs");
                   unknown keys are warned about and skipped.
        --sezona   season label passed to every scraper (default "2025/26").
        --dry-run  run scrapers but roll back instead of committing.

    Returns:
        Always 0 — a failing federation is recorded in the summary and logged,
        it does not change the exit status.

    Each federation runs in its own transaction: it is committed (or rolled
    back under --dry-run) as soon as it finishes, and rolled back if it
    crashes. That keeps one federation's failure — which may leave the
    transaction aborted — from discarding or blocking the others.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--feds", default="hns,hks,hrs",
                    help="comma list of federation keys to run")
    ap.add_argument("--sezona", default="2025/26")
    ap.add_argument("--dry-run", action="store_true",
                    help="run scrapers but do not COMMIT")
    args = ap.parse_args(argv)

    started = datetime.now(timezone.utc)
    summary = {"started": started.isoformat(), "sezona": args.sezona, "feds": {}}

    requested = [f.strip().lower() for f in args.feds.split(",") if f.strip()]

    conn = db_conn()
    conn.autocommit = False
    try:
        # RealDictCursor: helpers in this module read rows by column name.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            for fed in requested:
                if fed not in FEDS:
                    log.warning(f"unknown federation: {fed}")
                    continue
                sport, fn = FEDS[fed]
                log.info(f"scraping {fed} ({sport})")
                t0 = time.monotonic()
                try:
                    res = fn(cur, sezona=args.sezona)
                except Exception as e:
                    # Log with traceback so the failure is visible on stderr,
                    # not only inside the JSON summary.
                    log.exception(f"scrape {fed} crashed")
                    # Discard this federation's partial work and clear any
                    # aborted-transaction state before the next one runs.
                    conn.rollback()
                    res = {"inserted": 0, "updated": 0, "errors": 1,
                           "status": f"crash:{type(e).__name__}: {str(e)[:200]}"}
                else:
                    if args.dry_run:
                        conn.rollback()
                    else:
                        conn.commit()
                res["elapsed_s"] = round(time.monotonic() - t0, 1)
                summary["feds"][fed] = res

        if args.dry_run:
            log.info("dry-run: rolled back")
    finally:
        conn.close()

    summary["ended"] = datetime.now(timezone.utc).isoformat()
    print(json.dumps(summary, ensure_ascii=False, indent=2))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user