#!/usr/bin/env python3
# Seed cultural_qa training data from dabi.knowledge (Alan Ford, Čakavian dialect, šatrovački slang, sayings).
|
|
import hashlib
import logging
import os

import psycopg2
|
|
# Timestamped log lines tagged with the job name so they are easy to grep.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa] %(message)s')
log = logging.getLogger("cult_qa")

# libpq connection string for the source/target database.
# Set CULT_QA_DSN in the environment to override it without editing this file.
# NOTE(review): the fallback below embeds a real password in source control; it
# is kept only so existing invocations keep working. Rotate the credential and
# drop the default once every deployment provides CULT_QA_DSN.
DSN = os.environ.get(
    "CULT_QA_DSN",
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password=R1net2026!SecureDB#v7",
)
|
|
|
|
# Alan Ford character names, checked in order; the first one found in a fact wins.
_ALAN_FORD_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga',
)

# Upper bound on individually logged insert failures, to keep the log readable.
_MAX_LOGGED_INSERT_ERRORS = 3

_SELECT_FACTS_SQL = """
    SELECT id, fact, category FROM dabi.knowledge
    WHERE category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                       'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                       'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                       'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik')
      AND fact IS NOT NULL AND length(fact) > 30
    LIMIT 1000
"""

# NOTE(review): ON CONFLICT DO NOTHING only deduplicates if dabi.training_qa has
# a unique constraint/index covering these rows; otherwise every run of this
# script inserts the same pairs again. Confirm against the table definition.
_INSERT_QA_SQL = """
    INSERT INTO dabi.training_qa
        (question, answer, category, source_type, created_at)
    VALUES (%s, %s, %s, 'cultural_seed', now())
    ON CONFLICT DO NOTHING
"""


def _leading_term(fact):
    """Return the first token of *fact* as a dictionary term, or None.

    The first whitespace-separated token is treated as the term being defined.
    Tokens of two characters or fewer are rejected, and so are tokens that
    consist only of trailing punctuation (they would yield an empty term).
    """
    words = fact.split()
    if not words or len(words[0]) <= 2:
        return None
    term = words[0].rstrip('=,:;.').strip()
    return term or None


def _build_questions(fact, cat):
    """Return the question variants for one fact, chosen by its category.

    Categories that the query selects but that match no template here
    (e.g. 'lingvistika_qa', 'rijeka_qa') produce an empty list.
    """
    if 'alan_ford' in cat:
        fact_lower = fact.lower()
        for char in _ALAN_FORD_CHARACTERS:
            if char.lower() in fact_lower:
                return [f"Tko je {char}?", f"Što znaš o liku {char}?"]
        # No known character mentioned: fall back to a generic question.
        return ["Što znaš o Alan Fordu?"]

    if 'satrovacki' in cat:
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} na šatrovačkom?"]

    if 'cakavski' in cat or 'fjumanski' in cat:
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} u riječkom dijalektu?"]

    if 'rijeka_izreka' in cat:
        return ["Reci mi neku riječku izreku.", "Koje su tradicionalne riječke izreke?"]

    return []


def main():
    """Seed dabi.training_qa with question/answer pairs built from cultural facts.

    Reads up to 1000 facts from dabi.knowledge in the selected categories,
    generates at most two questions per fact, and inserts each pair with the
    fact text as the answer. Inserts are best-effort: a failing insert is
    counted and logged (the first few individually) and the run continues.
    The cursor and connection are always closed, even if the run aborts.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit so that one failed INSERT does not poison later statements.
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cur.execute(_SELECT_FACTS_SQL)
            rows = cur.fetchall()
            log.info("Cultural facts: %d", len(rows))

            inserted = 0
            failed = 0
            # NOTE(review): a per-pair hash of the fact id and question used to be
            # computed here but was never stored, so it has been removed; the fact
            # id is therefore unused.
            for _fid, fact, cat in rows:
                category = 'cultural_' + cat.split('_')[0]
                for question in _build_questions(fact, cat)[:2]:  # max 2 per fact
                    try:
                        cur.execute(_INSERT_QA_SQL, (question[:300], fact[:500], category))
                    except Exception as e:
                        # Deliberately broad: the driver can raise non-database
                        # errors for bad values, and seeding is best-effort.
                        failed += 1
                        if failed <= _MAX_LOGGED_INSERT_ERRORS:
                            log.warning("insert err: %s", e)
                    else:
                        inserted += cur.rowcount

            if failed:
                log.warning("Failed inserts: %d", failed)
            log.info("Inserted: %d cultural Q&A pairs", inserted)
        finally:
            cur.close()
    finally:
        conn.close()
|
|
|
|
# Run the seeding job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|