feat: /api/v2/analiza/* endpoints - sport analytics backend
This commit is contained in:
@@ -1,4 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv('/opt/rinet-gpu/.env.master')
|
||||
# auto-added by patch_scrapers_with_dotenv.sh
|
||||
"""
|
||||
HNS Master Harvester — Playwright-based scraper for semafor.hns.family
|
||||
─────────────────────────────────────────────────────────────────
|
||||
@@ -18,7 +21,7 @@ from psycopg2.extras import RealDictCursor, execute_values
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
# Database connection string. An explicit RINET_DSN always wins. The fallback
# is built only when RINET_DSN is absent, so DB_PASSWORD is required only in
# that case (passing the f-string as the os.getenv default would evaluate
# os.environ['DB_PASSWORD'] eagerly and raise KeyError even when RINET_DSN
# is set).
DSN = os.getenv("RINET_DSN")
if DSN is None:
    DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"

# Telegram bot token: read from the environment only. No literal fallback is
# kept in source; a token that was ever committed should be treated as
# compromised and rotated. An empty value means no token is configured.
TG = os.getenv("TG_BOT_TOKEN", "")
# Destination chat id for Telegram messages (overridable via TG_CHAT).
TG_CHAT = os.getenv("TG_CHAT", "7969491558")
# Per-run debug log, opened in append mode; the file name carries the start
# timestamp at minute resolution.
LOG = open(f"/var/log/pgz-sport-debug/hns_harvester_{datetime.now().strftime('%Y%m%d_%H%M')}.log", "a")
|
||||
@@ -210,12 +213,30 @@ def upsert_clan(conn, klub_id, player_data):
|
||||
def upsert_seasons(conn, hns_id, clan_id, seasons):
|
||||
if not seasons: return 0
|
||||
rows = []
|
||||
skipped = 0
|
||||
# Filter out rows whose klub_naziv is clearly a misparsed HTML stat-block.
# The parser in scrape_player_full() can emit these when a <table> row has
# fewer cells than the header: dict(zip(...)) silently drops the excess
# header names, so the value ends up holding a whole-block dump or a bare
# number.
_BAD_PREFIXES = (
    'STATISTIKA', 'NASTUPI', 'ZAPOČEO', 'ZAMJENA',
    'POGOTCI', 'ŽUTI', 'CRVENI', 'UKUPNO', 'SUPERSPORT',
)


def _looks_like_garbage(klub_text):
    """Return True when *klub_text* cannot be a real club name.

    A value is rejected when it is empty or whitespace-only, consists solely
    of digits (e.g. a year or jersey number), contains two or more newlines
    after stripping (a multi-line label dump), or starts, case-insensitively,
    with one of the stat-block labels in ``_BAD_PREFIXES``.
    """
    stripped = klub_text.strip() if klub_text else ''
    if not stripped:
        # Covers None, the empty string and whitespace-only values.
        return True

    is_bare_number = re.match(r'^\d+$', stripped) is not None
    is_multiline_dump = stripped.count('\n') >= 2
    if is_bare_number or is_multiline_dump:
        return True

    # str.startswith accepts a tuple and matches if any prefix matches.
    return stripped.upper().startswith(_BAD_PREFIXES)
|
||||
for s in seasons:
|
||||
sezona = s.get('sezona', '')
|
||||
if not sezona: continue
|
||||
# Try to extract the club name from the row
|
||||
klub = next((v for k,v in s.items() if 'lub' in k.lower()), '')
|
||||
natjecanje = next((v for k,v in s.items() if 'atjec' in k.lower() or 'liga' in k.lower()), '')
|
||||
if _looks_like_garbage(klub):
|
||||
skipped += 1
|
||||
continue
|
||||
def num(key):
|
||||
for k in s.keys():
|
||||
if key in k.lower():
|
||||
@@ -227,6 +248,8 @@ def upsert_seasons(conn, hns_id, clan_id, seasons):
|
||||
num('nastup'), num('start'), num('zamj'),
|
||||
num('gol'), num('asist'), num('žut'), num('crv'), num('minut')
|
||||
))
|
||||
if skipped:
|
||||
print(f'[hns_master_harvester] upsert_seasons: skipped {skipped} garbage rows for hns_id={hns_id}')
|
||||
with conn.cursor() as cur:
|
||||
execute_values(cur, """
|
||||
INSERT INTO pgz_sport.hns_player_seasons
|
||||
|
||||
Reference in New Issue
Block a user