]*>(.+?)(?=

for current visible # Actually each slide contains its own competition details # Better approach: extract all competition name spans in order comp_names = re.findall(r'

]>\s([^<]+?)\s*<', html) # And champion images for Vaterpolist (medal__name) medal_names = re.findall(r'

]>\s([^<]+?)\s*<', html) # And years years = re.findall(r'

]>\s([^<]+?)\s*<', html) # Or in name spans years_alt = re.findall(r'

]>\s([0-9]{4}\./[0-9]{4}\.)\s*<', html) # Also single-year format (Trofej, Vaterpolist often have just single year) years_single = re.findall(r'

]>\s([0-9]{4}\.)\s*<', html) all_years = list(set(years + years_alt + years_single)) all_years_sorted = sorted([y.strip() for y in all_years if y.strip()]) print(f" Champions: {len(comp_names)}, Medal names: {len(medal_names)}, Years: {len(all_years_sorted)}", flush=True) if comp_names: print(f" Sample champ: {comp_names[:3]}") if medal_names: print(f" Sample medal: {medal_names[:3]}") if all_years_sorted: print(f" Years range: {all_years_sorted[0]} → {all_years_sorted[-1]}") # The names may be aligned with years sequentially # Use whichever list has names - prefer comp_names (championships) else medal_names (individual awards) names = comp_names if comp_names else medal_names names = [n.strip() for n in names if n.strip()] # The champions in HTML order represent the seasons in display order # Map them to years - assume index alignment with sorted years if same length # Otherwise, the page shows multiple slides — same name may repeat # For each non-empty name, create entry # Best guess: names list and years list are PARALLEL (same length, in order on page) # Pages show all-time history, so years_alt (with format) is most reliable # Smart: if len(names) matches len(all_years_sorted), pair them # Else create entries with name+year separately and link by index # Even better: each "slide" block contains 1 name + 1 year contextually nearby # Find pairs by extracting full slides and matching internal patterns slide_pat = re.compile( r'(?:

]>\s([^<]+?)\s*<|

]>\s([^<]+?)\s<)' r'.?(?:

]>\s([0-9./]+)|

]>\s([0-9./]+))', re.DOTALL ) # That regex too complex - use simpler split approach # Split by slide divs all_records = [] slide_blocks = re.split(r'
', html) for blk in slide_blocks[1:]: # Find name (champion or medal_name) name_m = re.search(r'class="riznicacompetitionname"[^>]>\s([^<]+?)\s<', blk) if not name_m: name_m = re.search(r'class="riznicamedalname"[^>]>\s([^<]+?)\s<', blk) year_m = re.search(r'class="riznicacompetitionscurrent"[^>]>\s([0-9./]+)', blk) if not year_m: year_m = re.search(r'class="riznicacompetitionsname(?:[^"])"[^>]>\s([0-9./]+?)\s<', blk) # Image img_m = re.search(r' 0: inserted_total += 1 except Exception as e: pass all_winners.append({"category": label, "count": len(all_records), "records": all_records}) except Exception as e: print(f" EXC: {e}") print(f"\n\n=== TOTAL INSERTED: {inserted_total} ===") # Audit log cu.execute("""INSERT INTO pgz_sport.audit_feed (table_name, action, source, source_url, details) VALUES ('klub_sezona', 'hvs_riznica_scrape', 'hvs_riznica', 'https://hvs.hr/riznica/', %s::jsonb)""", (f'{{"inserted":{inserted_total},"categories":{len(CATEGORIES)}}}',)) # Top winners cu.execute("""SELECT klub_naziv, count() AS naslova FROM pgz_sport.klub_sezona WHERE source='hvs_riznica' GROUP BY klub_naziv ORDER BY count() DESC LIMIT 15""") print("\n=== TOP HVS prvaci/medalisti ===") for r in cu.fetchall(): print(f" {r[1]:>3}× {r[0]}") # PGŽ-relevant cu.execute("""SELECT k.naziv, count(ks.) AS naslova FROM pgz_sport.klub_sezona ks JOIN pgz_sport.klubovi k ON k.id = ks.klub_id WHERE ks.source='hvs_riznica' AND ks.klub_id IS NOT NULL GROUP BY k.naziv ORDER BY count() DESC""") print("\n=== PGŽ klubovi sa HVS naslovima ===") for r in cu.fetchall(): print(f" {r[1]:>3}× {r[0]}") conn.close()

]*>\s*([^<]+?)\s*<', html) # And champion images for Vaterpolist (medal__name) medal_names = re.findall(r'

]*>\s*([^<]+?)\s*<', html) # And years years = re.findall(r'

]*>\s*([^<]+?)\s*<', html) # Or in name spans years_alt = re.findall(r'

]*>\s*([0-9]{4}\./[0-9]{4}\.)\s*<', html) # Also single-year format (Trofej, Vaterpolist often have just single year) years_single = re.findall(r'

]*>\s*([^<]+?)\s*<|

]*>\s*([^<]+?)\s*<)' r'.*?(?:

]*>\s*([0-9./]+)|

]>\s([^<]+?)\s*<', html) # And champion images for Vaterpolist (medal__name) medal_names = re.findall(r'

]>\s([^<]+?)\s*<', html) # And years years = re.findall(r'

]>\s([^<]+?)\s*<', html) # Or in name spans years_alt = re.findall(r'

]>\s([0-9]{4}\./[0-9]{4}\.)\s*<', html) # Also single-year format (Trofej, Vaterpolist often have just single year) years_single = re.findall(r'

]>\s([^<]+?)\s*<|

]>\s([^<]+?)\s<)' r'.?(?:

]>\s([0-9./]+)|