Files
pgz-sport/scrapers/cultural_qa_seed_v2.py_prije_env_deepseek

91 lines
4.0 KiB
Python

#!/usr/bin/env python3
import os
# Extended cultural Q&A seed: derives question variants from each knowledge fact (at most 3 are stored per fact)
import psycopg2, hashlib, logging, re
# Log at INFO level and tag every record with the job name.
logging.basicConfig(
    format='%(asctime)s [cult_qa2] %(message)s',
    level=logging.INFO,
)
# Connection string handed to psycopg2.connect(). The password is read from
# the DB_PASSWORD environment variable at import time, so importing this
# module raises KeyError when the variable is not set.
DSN = (
    "host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password="
    + os.environ['DB_PASSWORD']
)
# Alan Ford characters looked for (case-insensitively) in the fact text.
_CHARACTERS = (
    'Alan Ford', 'Bob Rock', 'Sir Oliver', 'Broj Jedan', 'Grunf', 'Jeremija',
    'Šef', 'Margot', 'Superhik', 'Notar', 'Cifra Sluga', 'Don Galon',
    'Debela Gilda',
)
# Category substrings that mark a fact as a dialect/slang dictionary entry.
_TERM_CATEGORY_KEYS = ('satrovacki', 'cakavski', 'fjumanski', 'lokalni')
# Leading words that are never treated as the term being defined.
_STOP_WORDS = frozenset(
    ['ova', 'taj', 'jest', 'znači', 'što', 'kako', 'tko', 'gdje']
)
# Place names that trigger the generic Rijeka question.
_RIJEKA_KEYS = ('rijeka', 'kvarner', 'trsat', 'preluk')
# (category substring, fallback question), checked in this order.
_FALLBACK_QUESTIONS = (
    ('alan_ford', 'Pričaj mi nešto o Alan Fordu.'),
    ('cakavski', 'Pričaj mi o čakavskom dijalektu.'),
    ('satrovacki', 'Što je šatrovački?'),
    ('fjumanski', 'Što je fjumanski?'),
    ('rijeka', 'Što je posebno za Rijeku?'),
)
_WORD_RE = re.compile(r'\b\w+\b')
# At most this many generated questions are stored per fact.
_MAX_QUESTIONS_PER_FACT = 3

# Local Rijeka + dialect facts, selected by category or by keyword match.
_FACTS_SQL = """
    SELECT id, fact, category FROM dabi.knowledge
    WHERE (category IN ('alan_ford_priority','satrovacki_priority','cakavski_priority',
                        'fjumanski_priority','rijeka_izreka','lingvistika_qa',
                        'alan_ford_v3','alan_ford_qa','rijeka_alan','rijeka_humor',
                        'rijeka_lokalni','rijeka_qa','satrovacki_dict','satrovacki_jezik',
                        'pgz_administracija','pgz_promet','rijeka_lokali','rijeka_lokal')
           OR fact ~ '\\m(žišku|brodo|rista|vopi|kantun|ponistra|šugaman)\\M'
           OR fact ~ '\\m(Alan Ford|Bob Rock|Sir Oliver|TNT|Grunf)\\M')
      AND fact IS NOT NULL AND length(fact) > 30 AND length(fact) < 1500
    LIMIT 2000
"""

# NOTE(review): ON CONFLICT DO NOTHING only de-duplicates if the table has a
# unique constraint covering these columns; the schema is not visible here.
_INSERT_SQL = """
    INSERT INTO dabi.training_qa
        (question, answer, category, source_type, created_at)
    VALUES (%s, %s, %s, 'cultural_seed_v2', now())
    ON CONFLICT DO NOTHING
"""


def _build_questions(fact, cat):
    """Return the candidate questions for one fact, in priority order.

    Args:
        fact: The fact text (non-empty string).
        cat: The fact's category; pass '' when the row has no category.

    Returns:
        A list of question strings; may be empty when nothing matches.
    """
    fact_lc = fact.lower()
    cat_lc = cat.lower()
    questions = []

    # Only the first character (in list order) found in the text is used.
    character = next((ch for ch in _CHARACTERS if ch.lower() in fact_lc), None)
    if character is not None:
        questions += [
            f"Tko je {character}?",
            f"Što znaš o {character}?",
            f"Kakav je lik {character}?",
        ]

    # Dialect/slang entries: take the first usable word among the first
    # three words of the fact as the term being defined.
    if any(key in cat_lc for key in _TERM_CATEGORY_KEYS):
        for word in _WORD_RE.findall(fact)[:3]:
            if len(word) >= 3 and word.lower() not in _STOP_WORDS:
                questions += [f"Što znači riječ {word}?", f"Što je {word}?"]
                break

    # Rijeka sayings.
    if 'izrek' in cat_lc or 'izrek' in fact_lc:
        questions += [
            "Reci mi neku riječku izreku.",
            "Imaš li primjer riječke poslovice?",
        ]

    # General Rijeka context.
    if any(key in fact_lc for key in _RIJEKA_KEYS):
        questions.append("Što mi možeš reći o Rijeci?")

    # Nothing matched: fall back to one generic question for the category.
    if not questions:
        for key, question in _FALLBACK_QUESTIONS:
            if key in cat_lc:
                questions.append(question)
                break

    return questions


def main():
    """Seed dabi.training_qa with questions generated from cultural facts.

    Reads up to 2000 facts from dabi.knowledge, generates questions for each
    one and inserts the first three as (question, fact) pairs with
    source_type 'cultural_seed_v2'. Inserts are best-effort: a failing insert
    is logged and skipped, and the run continues.

    Raises:
        psycopg2.Error: If connecting or the initial SELECT fails.
    """
    conn = psycopg2.connect(DSN)
    try:
        # Autocommit keeps one failed INSERT from aborting the ones after it.
        conn.autocommit = True
        cur = conn.cursor()
        try:
            cur.execute(_FACTS_SQL)
            rows = cur.fetchall()
            logging.info("Cultural facts proširen: %d", len(rows))

            inserted = 0
            failed = 0
            for fid, fact, cat in rows:
                # Rows matched only by the keyword regexes may have a NULL
                # category; treat it as empty instead of crashing on .lower().
                cat = cat or ''
                # Stored category is 'cultural_' + the first '_'-separated
                # token of the source category (bare 'cultural_' if none).
                label = 'cultural_' + cat.split('_')[0][:20]

                # NOTE(review): the previous version computed a sha256
                # de-duplication hash per (fact id, question) here but never
                # stored it; dropped as dead code.
                for question in _build_questions(fact, cat)[:_MAX_QUESTIONS_PER_FACT]:
                    try:
                        cur.execute(
                            _INSERT_SQL, (question[:300], fact[:800], label)
                        )
                    except psycopg2.Error as exc:
                        failed += 1
                        logging.warning(
                            "Insert failed for fact %s: %s", fid, exc
                        )
                    else:
                        inserted += cur.rowcount

            logging.info("Inserted: %d cultural Q&A v2", inserted)
            if failed:
                logging.warning("Failed inserts: %d", failed)
        finally:
            cur.close()
    finally:
        conn.close()
# Run the seeding job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()