Files
pgz-sport/scrapers/cultural_qa_seed.py_prije_env_deepseek

80 lines
3.3 KiB
Python

#!/usr/bin/env python3
import os
# Seed cultural_qa training data from dabi.knowledge (Alan Ford, Chakavian dialect, šatrovački slang, sayings)
import psycopg2, hashlib, logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa] %(message)s')
log = logging.getLogger("cult_qa")


def _quote_conninfo(value):
    """Return *value* as a single-quoted libpq keyword/value connection-string value.

    libpq requires values that are empty or contain spaces to be wrapped in
    single quotes, with backslashes and single quotes inside escaped by a
    backslash. Quoting unconditionally is valid for every value.
    """
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


# Connection string handed to psycopg2.connect(). DB_PASSWORD is read from the
# environment at import time; a missing variable raises KeyError right here.
# The password is quoted so that spaces, quotes or backslashes in it cannot
# break (or inject extra keywords into) the connection string.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet "
    f"password={_quote_conninfo(os.environ['DB_PASSWORD'])}"
)
# Alan Ford characters searched for in a fact, in priority order: the first
# name found in the fact text decides which character the questions are about.
_ALAN_FORD_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga',
)

# Source facts: at most 1000 rows from the listed cultural categories whose
# fact text is longer than 30 characters.
_SELECT_FACTS_SQL = """
    SELECT id, fact, category FROM dabi.knowledge
    WHERE category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                       'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                       'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                       'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik')
      AND fact IS NOT NULL AND length(fact) > 30
    LIMIT 1000
"""

# NOTE(review): ON CONFLICT DO NOTHING only skips duplicates if
# dabi.training_qa has a unique constraint that these columns can violate;
# the schema is not visible here - confirm.
_INSERT_QA_SQL = """
    INSERT INTO dabi.training_qa
        (question, answer, category, source_type, created_at)
    VALUES (%s, %s, %s, 'cultural_seed', now())
    ON CONFLICT DO NOTHING
"""

_MAX_QUESTIONS_PER_FACT = 2
_MAX_LOGGED_INSERT_ERRORS = 3


def _leading_term(fact):
    """Return the first word of *fact* without trailing '=,:;.', or None.

    None is returned when the fact has no words, when its first word is two
    characters or shorter, or when nothing is left after stripping the
    trailing punctuation (so no question is built around an empty term).
    """
    words = fact.split()
    if not words or len(words[0]) <= 2:
        return None
    return words[0].rstrip('=,:;.') or None


def _build_questions(fact: str, cat: str) -> list:
    """Return the list of question strings generated for one fact.

    The category name selects the template; categories matching none of the
    templates yield an empty list, as does a dialect/slang fact without a
    usable leading term.
    """
    if 'alan_ford' in cat:
        fact_lower = fact.lower()
        for char in _ALAN_FORD_CHARACTERS:
            if char.lower() in fact_lower:
                return [f"Tko je {char}?", f"Što znaš o liku {char}?"]
        # No known character mentioned: fall back to a generic question.
        return ["Što znaš o Alan Fordu?"]

    if 'satrovacki' in cat:
        # The first word of the fact is treated as the term being defined.
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} na šatrovačkom?"]

    if 'cakavski' in cat or 'fjumanski' in cat:
        term = _leading_term(fact)
        if term is None:
            return []
        return [f"Što znači {term}?", f"Što je {term} u riječkom dijalektu?"]

    if 'rijeka_izreka' in cat:
        return ["Reci mi neku riječku izreku.", "Koje su tradicionalne riječke izreke?"]

    return []


def main() -> None:
    """Seed dabi.training_qa with Q&A pairs derived from cultural facts.

    Reads facts from dabi.knowledge, generates up to two questions per fact
    and inserts each (question, fact) pair with source_type 'cultural_seed'.
    Inserts are best-effort: a failing insert is counted and skipped, the
    first few failures are logged in detail, and the total is reported at
    the end. The connection is closed even if the run aborts.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit keeps one failed insert from aborting the following ones.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(_SELECT_FACTS_SQL)
            rows = cur.fetchall()
            log.info("Cultural facts: %d", len(rows))

            inserted = 0
            failed = 0
            for _fid, fact, cat in rows:
                # Target category is 'cultural_' + the first '_'-separated
                # segment of the source category, e.g. 'cultural_alan'.
                qa_category = 'cultural_' + cat.split('_')[0]
                questions = _build_questions(fact, cat)[:_MAX_QUESTIONS_PER_FACT]
                for q in questions:
                    try:
                        cur.execute(_INSERT_QA_SQL, (q[:300], fact[:500], qa_category))
                    except Exception as e:
                        # Deliberately broad: besides psycopg2.Error the driver
                        # can raise e.g. ValueError for NUL bytes in a string,
                        # and one bad row must not stop the whole seed run.
                        failed += 1
                        if failed <= _MAX_LOGGED_INSERT_ERRORS:
                            log.warning("insert err: %s", e)
                    else:
                        # rowcount is 0 when ON CONFLICT skipped the row.
                        inserted += cur.rowcount

            log.info("Inserted: %d cultural Q&A pairs", inserted)
            if failed:
                log.warning("Failed inserts: %d", failed)
    finally:
        conn.close()
if __name__ == "__main__":
    # Run the seeding job only when this file is executed as a script.
    main()