#!/usr/bin/env python3
"""Common scraper helpers.

Shared building blocks for the scrapers: HTTP fetching with retries,
regex-based HTML-to-text conversion, text chunking, bulk insertion of facts
into ``dabi.knowledge`` and discovery of same-host links.
"""
from dotenv import load_dotenv

# auto-added. Must run before DSN is built, because DSN reads DB_PASSWORD
# from the environment at import time.
load_dotenv("/opt/rinet-gpu/.env.master")

import hashlib
import json
import logging
import os
import re
import time
import urllib.request
from html import unescape
from typing import List, Optional, Tuple, Union
from urllib.parse import quote, urlencode, urljoin, urlparse

import psycopg2
from psycopg2.extensions import TRANSACTION_STATUS_INERROR
from psycopg2.extras import execute_batch

logger = logging.getLogger(__name__)

# NOTE(review): the password is interpolated unquoted; a value containing
# spaces, quotes or backslashes would yield an invalid libpq DSN.
DSN = f"host=10.10.0.2 port=6432 dbname=rinet_v3 user=rinet password={os.environ['DB_PASSWORD']}"
UA = "Mozilla/5.0 (Ri.NET Civic Bot 1.0; contact: dradulic@outlook.com)"

# Elements whose whole content is discarded before tags are stripped.
# NOTE(review): the tag names were missing from the patterns this replaces.
# script/style are the evident intent; nav/footer are an assumption, so
# confirm against the intended list.
_DROPPED_ELEMENTS_RE = re.compile(
    r"<(script|style|nav|footer)\b[^>]*>.*?</\1\s*>", re.S | re.I
)
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")
_TITLE_RE = re.compile(r"<title\b[^>]*>([^<]+)", re.I)

# Matches quoted href attributes; attribute names are case-insensitive in HTML.
HREF_RE = re.compile("href=[\"']([^\"']+)[\"']", re.I)

_UPSERT_SQL = (
    "INSERT INTO dabi.knowledge (fact, source, category, confidence, data_hash, source_refs) "
    "VALUES (%s, %s, %s, %s, %s, %s::jsonb) ON CONFLICT (data_hash) DO NOTHING"
)


def fetch(
    url: str, timeout: float = 20, retries: int = 3, binary: bool = False
) -> Tuple[Optional[Union[str, bytes]], int]:
    """Download *url*, retrying with a linearly growing pause between attempts.

    Args:
        url: Address to request; sent with the module's ``UA`` User-Agent.
        timeout: Per-attempt timeout in seconds passed to ``urlopen``.
        retries: Maximum number of attempts.
        binary: Return the raw bytes instead of decoded text.

    Returns:
        ``(body, status)`` where *status* is the response's ``status``
        attribute. The body is decoded with the charset declared in the
        Content-Type header, falling back to UTF-8; undecodable bytes are
        replaced. When every attempt fails the result is ``(None, 0)``.
        This function never raises for request failures.
    """
    for attempt in range(retries):
        try:
            # Built inside the try so a malformed URL is also reported as
            # (None, 0) rather than raised.
            request = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(request, timeout=timeout) as response:
                payload = response.read()
                status = response.status
                charset = response.headers.get_content_charset() or "utf-8"
        except Exception as exc:  # deliberate best-effort: any failure means retry
            logger.debug(
                "fetch %s failed (attempt %d/%d): %s", url, attempt + 1, retries, exc
            )
            # Pausing after the last attempt would only delay the failure result.
            if attempt + 1 < retries:
                time.sleep(2 * (attempt + 1))
            continue

        if binary:
            return payload, status
        try:
            return payload.decode(charset, errors="replace"), status
        except (LookupError, UnicodeError):
            # Unknown or unusable declared charset: fall back to UTF-8.
            return payload.decode("utf-8", errors="replace"), status
    return None, 0


def extract_text(html: Optional[str]) -> str:
    """Reduce an HTML document to whitespace-normalised plain text.

    Non-content elements (see ``_DROPPED_ELEMENTS_RE``) are removed together
    with their content, remaining tags become spaces, entities are unescaped
    and whitespace runs collapse to single spaces. Falsy input yields ``""``.
    """
    if not html:
        return ""
    without_noise = _DROPPED_ELEMENTS_RE.sub("", html)
    text = _TAG_RE.sub(" ", without_noise)
    return _WHITESPACE_RE.sub(" ", unescape(text)).strip()


def extract_title(html: Optional[str]) -> str:
    """Return the text of the first ``<title>`` element, or ``""`` if absent.

    Entities are unescaped and whitespace runs collapse to single spaces.
    """
    match = _TITLE_RE.search(html or "")
    if match is None:
        return ""
    return _WHITESPACE_RE.sub(" ", unescape(match.group(1))).strip()


def chunk_text(text: Optional[str], max_len: int = 800, min_len: int = 80) -> List[str]:
    """Split *text* into chunks of at most *max_len* characters.

    A chunk preferably ends just after a sentence separator (``". "``,
    ``"! "``, ``"? "`` or a newline, tried in that order) when one occurs in
    the second half of the window; otherwise it is cut at *max_len*.

    Args:
        text: Text to split; ``None`` or ``""`` yields ``[]``.
        max_len: Maximum chunk length; must be positive when splitting.
        min_len: Chunks of this length or shorter are dropped after splitting.
            Text that already fits in one chunk is returned whole regardless.

    Raises:
        ValueError: If *text* needs splitting and *max_len* is not positive
            (the window would never advance).
    """
    if not text:
        return []
    if len(text) <= max_len:
        return [text]
    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")

    separators = (". ", "! ", "? ", "\n")
    total = len(text)
    pieces = []
    start = 0
    while start < total:
        end = min(start + max_len, total)
        if end < total:
            for sep in separators:
                pos = text.rfind(sep, start, end)
                # Only accept a break in the second half of the window so
                # chunks do not become needlessly short.
                if pos > start + max_len // 2:
                    end = pos + len(sep)
                    break
        pieces.append(text[start:end].strip())
        start = end
    return [piece for piece in pieces if len(piece) > min_len]


def upsert_facts(conn, facts, source_name, category, confidence=0.85):
    """Insert *facts* into ``dabi.knowledge``, skipping known ``data_hash`` values.

    Args:
        conn: Open psycopg2 connection. This function does not commit.
        facts: Iterable of dicts with a ``"fact"`` string and optional
            ``"url"`` / ``"title"`` keys. The dicts are not modified.
        source_name: Value stored in the ``source`` column.
        category: Value stored in the ``category`` column.
        confidence: Value stored in the ``confidence`` column.

    Returns:
        ``cursor.rowcount`` after the batch, or ``0`` when *facts* is empty
        or the insert failed.
        NOTE(review): ``execute_batch`` issues several ``execute()`` calls
        and ``rowcount`` only describes the most recent one, so this is not
        a reliable total of inserted rows.
    """
    if not facts:
        return 0

    rows = []
    for item in facts:
        # PostgreSQL text values cannot contain NUL characters.
        fact_text = item["fact"].replace("\x00", "")
        digest = hashlib.md5(fact_text.encode()).hexdigest()
        refs = json.dumps({"url": item.get("url", ""), "title": item.get("title", "")})
        rows.append((fact_text, source_name, category, confidence, digest, refs))

    try:
        with conn.cursor() as cur:
            execute_batch(cur, _UPSERT_SQL, rows, page_size=50)
            return cur.rowcount
    except Exception:  # deliberate best-effort: callers expect 0 on failure
        logger.exception(
            "upsert_facts failed for source=%r category=%r", source_name, category
        )
        _rollback_if_aborted(conn)
        return 0


def _rollback_if_aborted(conn):
    """Roll back *conn* only if its transaction is in the failed state.

    After a server-side error every further statement is rejected until a
    rollback; a transaction that is still healthy is left untouched so the
    caller's pending work is not discarded.
    """
    try:
        if conn.get_transaction_status() == TRANSACTION_STATUS_INERROR:
            conn.rollback()
    except Exception as exc:  # e.g. the connection itself is gone
        logger.debug("rollback after failed upsert was not possible: %s", exc)


def find_internal_links(html: Optional[str], base_url: str) -> List[str]:
    """Return the sorted, de-duplicated links in *html* on *base_url*'s host.

    Each quoted ``href`` is HTML-unescaped, resolved against *base_url* and
    kept when its hostname equals the base hostname; fragments are removed.
    Hrefs that cannot be parsed as URLs are skipped.
    """
    if not html:
        return []
    base_host = urlparse(base_url).hostname or ""
    links = set()
    for match in HREF_RE.finditer(html):
        # Attribute values are HTML-escaped in markup ("&amp;" means "&").
        href = unescape(match.group(1)).strip()
        try:
            absolute = urljoin(base_url, href)
            host = urlparse(absolute).hostname or ""
        except ValueError:
            # One malformed href must not abort link discovery for the page.
            continue
        if host == base_host:
            links.add(absolute.split("#")[0])
    return sorted(links)