PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
# Seed cultural_qa training data from dabi.knowledge (Alan Ford, Chakavian dialect, šatrovački slang, sayings)
|
||||
import hashlib
import logging
import os

import psycopg2
|
||||
# Script-wide logging: timestamped lines tagged with the "cult_qa" marker.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa] %(message)s')
log = logging.getLogger("cult_qa")

# libpq-style connection string used by main().
# The CULT_QA_DSN environment variable takes precedence so the script can be
# pointed at another database without editing the file; the literal below is
# the original default and is used when the variable is not set.
# NOTE(review): the fallback embeds a plaintext password in source control —
# rotate it and supply the DSN through the environment (or a secrets store).
DSN = os.environ.get(
    "CULT_QA_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
||||
|
||||
# Alan Ford characters probed for, in priority order: the first name found in
# a fact decides which character the generated questions ask about.
_ALAN_FORD_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga',
)

# Upper bound on the number of generated questions stored per fact.
_MAX_QUESTIONS_PER_FACT = 2

# Failed inserts are logged individually up to this count, then only tallied.
_MAX_LOGGED_INSERT_ERRORS = 3


def _leading_term(fact):
    """Return the first word of *fact* as a dictionary term, or None.

    The first whitespace-separated word is taken as the term when it is longer
    than two characters; trailing ``= , : ; .`` characters are stripped from
    it. None is returned when there is no such word or when nothing is left
    after stripping (e.g. the fact starts with ``===``), so callers never
    build a question around an empty term.
    """
    words = fact.split()
    if not words or len(words[0]) <= 2:
        return None
    term = words[0].rstrip('=,:;.').strip()
    return term or None


def _questions_for(fact, cat):
    """Build the list of question strings for one fact of category *cat*.

    The category is matched by substring. Categories that match none of the
    branches below yield an empty list.
    """
    if 'alan_ford' in cat:
        fact_lower = fact.lower()
        for char in _ALAN_FORD_CHARACTERS:
            # Case-insensitive substring match; only the first hit is used.
            if char.lower() in fact_lower:
                return [f"Tko je {char}?", f"Što znaš o liku {char}?"]
        # No known character mentioned: fall back to a generic question.
        return ["Što znaš o Alan Fordu?"]

    if 'satrovacki' in cat:
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} na šatrovačkom?"]

    if 'cakavski' in cat or 'fjumanski' in cat:
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} u riječkom dijalektu?"]

    if 'rijeka_izreka' in cat:
        return ["Reci mi neku riječku izreku.",
                "Koje su tradicionalne riječke izreke?"]

    return []


def _insert_pairs(cur, rows):
    """Insert generated Q&A pairs for *rows* through cursor *cur*.

    *rows* is an iterable of ``(id, fact, category)`` tuples. Each question is
    stored truncated to 300 characters and the fact (used as the answer) to
    500. A failing insert does not stop the run.

    Returns a tuple ``(inserted, failed)``: the summed ``rowcount`` of the
    successful inserts and the number of inserts that raised.
    """
    inserted = 0
    failed = 0
    for _fact_id, fact, cat in rows:
        # Target category is "cultural_" plus the first "_"-separated token
        # of the source category (e.g. alan_ford_qa -> cultural_alan).
        qa_category = 'cultural_' + cat.split('_')[0]
        for question in _questions_for(fact, cat)[:_MAX_QUESTIONS_PER_FACT]:
            try:
                # NOTE(review): ON CONFLICT DO NOTHING only skips duplicates
                # if dabi.training_qa has a matching unique constraint —
                # confirm against the table definition.
                cur.execute("""
                    INSERT INTO dabi.training_qa
                        (question, answer, category, source_type, created_at)
                    VALUES (%s, %s, %s, 'cultural_seed', now())
                    ON CONFLICT DO NOTHING
                """, (question[:300], fact[:500], qa_category))
            except psycopg2.Error as e:
                # Best effort: keep going, but never lose sight of failures.
                failed += 1
                if failed <= _MAX_LOGGED_INSERT_ERRORS:
                    log.warning("insert err: %s", e)
            else:
                inserted += cur.rowcount
    return inserted, failed


def main():
    """Seed dabi.training_qa with Q&A pairs derived from cultural facts.

    Reads up to 1000 facts longer than 30 characters from dabi.knowledge for a
    fixed set of categories, generates at most two questions per fact and
    inserts each question with the fact as its answer. The connection runs in
    autocommit mode, so every insert is committed on its own. The cursor and
    the connection are closed even when an error interrupts the run.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        with conn.cursor() as cur:
            # ORDER BY makes the LIMIT pick the same rows on every run.
            # NOTE(review): lingvistika_qa, rijeka_alan, rijeka_humor,
            # rijeka_lokalni and rijeka_qa are selected here but match no
            # question generator, so they currently produce no pairs.
            cur.execute("""
                SELECT id, fact, category FROM dabi.knowledge
                WHERE category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                                   'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                                   'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                                   'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik')
                  AND fact IS NOT NULL AND length(fact) > 30
                ORDER BY id
                LIMIT 1000
            """)
            rows = cur.fetchall()
            log.info("Cultural facts: %d", len(rows))

            inserted, failed = _insert_pairs(cur, rows)

            log.info("Inserted: %d cultural Q&A pairs", inserted)
            if failed:
                log.warning("Failed inserts: %d", failed)
    finally:
        conn.close()
|
||||
|
||||
# Run the seeding only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user