PGŽ Sport Platform — Round 1+2 baseline (sport2.html + API)
This commit is contained in:
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env python3
|
||||
# Extended cultural Q&A seed (each fact yields up to 3 question variants)
|
||||
import psycopg2, hashlib, logging, re
|
||||
# Root logger setup: INFO and above, each record tagged with "[cult_qa2]".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [cult_qa2] %(message)s',
)
|
||||
# Connection string handed to psycopg2.connect() in main().
# NOTE(review): this embeds a plaintext database password in version-controlled
# source. It should be rotated and loaded from the environment or a secret
# store instead of being committed here.
DSN = "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7"
|
||||
|
||||
# Alan Ford character names; only the first one found in a fact is used.
_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga', 'Don Galon',
    'Debela Gilda',
)

# Category fragments for which one of the fact's leading words is treated as
# the term being explained.
_TERM_CATEGORY_KEYS = ('satrovacki', 'cakavski', 'fjumanski', 'lokalni')

# Leading words that are skipped when picking the term to ask about.
_SKIP_WORDS = frozenset(
    {'ova', 'taj', 'jest', 'znači', 'što', 'kako', 'tko', 'gdje'}
)

# Place names that trigger the generic "tell me about Rijeka" question.
_RIJEKA_MARKERS = ('rijeka', 'kvarner', 'trsat', 'preluk')

# Fallback question per category fragment; dict order decides which fragment
# wins when several match.
_FALLBACK_QUESTIONS = {
    'alan_ford': 'Pričaj mi nešto o Alan Fordu.',
    'cakavski': 'Pričaj mi o čakavskom dijalektu.',
    'satrovacki': 'Što je šatrovački?',
    'fjumanski': 'Što je fjumanski?',
    'rijeka': 'Što je posebno za Rijeku?',
}


def _build_questions(fact, cat):
    """Return the templated questions for one fact, in priority order.

    Args:
        fact: The fact text; it becomes the answer to every returned question.
        cat: The fact's category string (may be empty, must not be None).

    Returns:
        A list of question strings, possibly empty. The caller decides how
        many of them to store.
    """
    questions = []
    fact_lower = fact.lower()
    cat_lower = cat.lower()

    # Alan Ford characters: ask about the first character mentioned.
    for name in _CHARACTERS:
        if name.lower() in fact_lower:
            questions.extend(
                [f"Tko je {name}?", f"Što znaš o {name}?", f"Kakav je lik {name}?"]
            )
            break

    # Dialect / slang categories: take the first of the three leading words
    # that is long enough and not a stop word, and ask what it means.
    if any(key in cat_lower for key in _TERM_CATEGORY_KEYS):
        for word in re.findall(r'\b\w+\b', fact)[:3]:
            if len(word) >= 3 and word.lower() not in _SKIP_WORDS:
                questions.append(f"Što znači riječ {word}?")
                questions.append(f"Što je {word}?")
                break

    # Rijeka sayings.
    if 'izrek' in cat_lower or 'izrek' in fact_lower:
        questions.append("Reci mi neku riječku izreku.")
        questions.append("Imaš li primjer riječke poslovice?")

    # General Rijeka context.
    if any(marker in fact_lower for marker in _RIJEKA_MARKERS):
        questions.append("Što mi možeš reći o Rijeci?")

    # Nothing matched: fall back to one generic question for the category.
    if not questions:
        for key, question in _FALLBACK_QUESTIONS.items():
            if key in cat_lower:
                questions.append(question)
                break

    return questions


def main():
    """Seed dabi.training_qa with templated questions for cultural facts.

    Selects up to 2000 facts from dabi.knowledge (by category list or by
    keyword regex on the fact text), builds questions for each fact and
    inserts at most three question/answer rows per fact with
    source_type 'cultural_seed_v2'. The fact text, truncated to 800
    characters, is stored as the answer.

    Inserts are best-effort: a failing insert is logged and skipped so one bad
    row does not stop the run. The connection runs in autocommit mode, so each
    statement is committed on its own.

    Raises:
        psycopg2.Error: If connecting or the initial SELECT fails.
    """
    conn = psycopg2.connect(DSN)
    try:
        conn.autocommit = True
        cur = conn.cursor()
        try:
            # Local Rijeka + dialect facts.
            cur.execute("""
                SELECT id, fact, category FROM dabi.knowledge
                WHERE (category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                                    'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                                    'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                                    'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik',
                                    'pgz_administracija','pgz_promet','rijeka_lokali','rijeka_lokal')
                       OR fact ~ '\\m(žišku|brodo|rista|vopi|kantun|ponistra|šugaman)\\M'
                       OR fact ~ '\\m(Alan Ford|Bob Rock|Sir Oliver|TNT|Grunf)\\M')
                  AND fact IS NOT NULL AND length(fact) > 30 AND length(fact) < 1500
                LIMIT 2000
            """)
            rows = cur.fetchall()
            logging.info("Cultural facts proširen: %d", len(rows))

            inserted = 0
            for fid, fact, cat in rows:
                # Rows matched only by the fact regex are not constrained by
                # category, so it may be NULL (assuming the column is
                # nullable); treat that as an empty category instead of
                # crashing on None.
                cat = cat or ''
                stored_category = 'cultural_' + cat.split('_')[0][:20]

                for question in _build_questions(fact, cat)[:3]:
                    try:
                        cur.execute("""
                            INSERT INTO dabi.training_qa
                                (question, answer, category, source_type, created_at)
                            VALUES (%s, %s, %s, 'cultural_seed_v2', now())
                            ON CONFLICT DO NOTHING
                        """, (question[:300], fact[:800], stored_category))
                    except psycopg2.Error as exc:
                        # Best-effort insert: report the failure and carry on.
                        logging.warning(
                            "Insert failed for fact %s (%r): %s", fid, question, exc
                        )
                    else:
                        # rowcount is 0 when ON CONFLICT skipped the row.
                        inserted += cur.rowcount

            logging.info("Inserted: %d cultural Q&A v2", inserted)
        finally:
            cur.close()
    finally:
        conn.close()
|
||||
|
||||
if __name__ == "__main__":
    # Run the seeding job only when executed as a script, not on import.
    main()
|
||||
Reference in New Issue
Block a user