91 lines
4.0 KiB
Python
91 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
|
import os
|
|
# Extended cultural Q&A seed: generates question variants for each knowledge fact (at most 3 are stored per fact).
|
|
import psycopg2, hashlib, logging, re
|
|
# Root logger configuration for this script; every record is tagged "[cult_qa2]".
logging.basicConfig(level=logging.INFO, format='%(asctime)s [cult_qa2] %(message)s')

# The password comes from the environment (a missing DB_PASSWORD raises KeyError
# at import time, so the script fails fast).  In a libpq keyword/value connection
# string, a value containing spaces, quotes or backslashes must be wrapped in
# single quotes with "\" and "'" backslash-escaped; backslashes are escaped first
# so the escapes added for quotes are not doubled.
_db_password = os.environ['DB_PASSWORD'].replace('\\', '\\\\').replace("'", "\\'")

# libpq connection string consumed by psycopg2.connect() in main().
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password='{_db_password}'"
|
|
|
|
# Alan Ford character names looked for in the fact text (case-insensitive
# substring match, checked in this order).
_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga', 'Don Galon',
    'Debela Gilda',
)

# Category substrings for which a leading word of the fact is treated as a
# term to ask about.
_TERM_CATEGORY_KEYS = ('satrovacki', 'cakavski', 'fjumanski', 'lokalni')

# Leading words that are never used as the term.
_TERM_STOPWORDS = frozenset(
    ('ova', 'taj', 'jest', 'znači', 'što', 'kako', 'tko', 'gdje')
)

# Fact-text keywords that trigger the generic Rijeka question.
_RIJEKA_KEYWORDS = ('rijeka', 'kvarner', 'trsat', 'preluk')

# (category substring, question) pairs tried in order when no other rule
# produced a question; the first matching pair wins.
_FALLBACK_QUESTIONS = (
    ('alan_ford', 'Pričaj mi nešto o Alan Fordu.'),
    ('cakavski', 'Pričaj mi o čakavskom dijalektu.'),
    ('satrovacki', 'Što je šatrovački?'),
    ('fjumanski', 'Što je fjumanski?'),
    ('rijeka', 'Što je posebno za Rijeku?'),
)

# Only the first N generated questions of a fact are stored.
_MAX_QUESTIONS_PER_FACT = 3

# Local Rijeka + dialect facts: selected either by category or by marker words
# in the text.  "\\m" / "\\M" reach PostgreSQL as \m / \M (word start / end).
# NOTE(review): LIMIT without ORDER BY, so which 2000 rows are returned is up
# to the database.
_SELECT_FACTS_SQL = """
    SELECT id, fact, category FROM dabi.knowledge
    WHERE (category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                        'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                        'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                        'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik',
                        'pgz_administracija','pgz_promet','rijeka_lokali','rijeka_lokal')
           OR fact ~ '\\m(žišku|brodo|rista|vopi|kantun|ponistra|šugaman)\\M'
           OR fact ~ '\\m(Alan Ford|Bob Rock|Sir Oliver|TNT|Grunf)\\M')
      AND fact IS NOT NULL AND length(fact) > 30 AND length(fact) < 1500
    LIMIT 2000
"""

# NOTE(review): ON CONFLICT DO NOTHING only de-duplicates if dabi.training_qa
# has a matching unique constraint; the schema is not visible here.
_INSERT_QA_SQL = """
    INSERT INTO dabi.training_qa
        (question, answer, category, source_type, created_at)
    VALUES (%s, %s, %s, 'cultural_seed_v2', now())
    ON CONFLICT DO NOTHING
"""


def _build_questions(fact, category):
    """Return the candidate questions for one fact, in priority order.

    Args:
        fact: Fact text (non-empty string).
        category: Category string of the fact; ``None`` is treated as an
            empty category.

    Returns:
        A list of question strings; may be empty when no rule matches.
    """
    fact_lower = fact.lower()
    cat_lower = (category or '').lower()
    questions = []

    # Only the first character found (in _CHARACTERS order) yields questions.
    for name in _CHARACTERS:
        if name.lower() in fact_lower:
            questions.extend(
                [f"Tko je {name}?", f"Što znaš o {name}?", f"Kakav je lik {name}?"]
            )
            break

    # Dialect/slang categories: use the first of the three leading words that
    # is at least 3 characters long and not a stopword as the term.
    if any(key in cat_lower for key in _TERM_CATEGORY_KEYS):
        for word in re.findall(r'\b\w+\b', fact)[:3]:
            if len(word) >= 3 and word.lower() not in _TERM_STOPWORDS:
                questions.append(f"Što znači riječ {word}?")
                questions.append(f"Što je {word}?")
                break

    # Rijeka sayings.
    if 'izrek' in cat_lower or 'izrek' in fact_lower:
        questions.append("Reci mi neku riječku izreku.")
        questions.append("Imaš li primjer riječke poslovice?")

    # General Rijeka context.
    if any(keyword in fact_lower for keyword in _RIJEKA_KEYWORDS):
        questions.append("Što mi možeš reći o Rijeci?")

    # Fallback: a single generic question chosen by category.
    if not questions:
        for key, question in _FALLBACK_QUESTIONS:
            if key in cat_lower:
                questions.append(question)
                break

    return questions


def main() -> None:
    """Seed dabi.training_qa with question/answer pairs built from cultural facts.

    Reads up to 2000 facts from dabi.knowledge, generates questions for each
    one with ``_build_questions`` and inserts at most
    ``_MAX_QUESTIONS_PER_FACT`` rows per fact, using the fact text (first 800
    characters) as the answer.  A failed insert is logged and skipped; the
    connection is always closed.

    Raises:
        psycopg2.Error: If connecting or the initial SELECT fails.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit: each INSERT is its own transaction, so one failed insert
        # does not leave an aborted transaction that blocks the following ones.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(_SELECT_FACTS_SQL)
            rows = cur.fetchall()
            logging.info("Cultural facts proširen: %d", len(rows))

            inserted = 0
            for fact_id, fact, category in rows:
                # Stored category: "cultural_" + text before the first "_".
                # Rows matched only by the text patterns may have no category.
                # NOTE(review): 'misc' is a placeholder label for NULL
                # categories — confirm the desired value.
                prefix = category.split('_')[0][:20] if category else 'misc'
                label = 'cultural_' + prefix

                for question in _build_questions(fact, category)[:_MAX_QUESTIONS_PER_FACT]:
                    try:
                        cur.execute(_INSERT_QA_SQL, (question[:300], fact[:800], label))
                    except psycopg2.Error as exc:
                        # Best effort: report the failure and keep seeding.
                        logging.warning("Insert failed for fact %s: %s", fact_id, exc)
                        continue
                    # rowcount is 0 when ON CONFLICT skipped the row.
                    inserted += cur.rowcount

            logging.info("Inserted: %d cultural Q&A v2", inserted)
    finally:
        conn.close()
|
|
|
|
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|