#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "dateparser",
# ]
# ///
"""Scrape every link saved in the local thoughts database and upload the
extracted documents to a Searchcraft index.

Flow: read links from SQLite -> fetch all pages concurrently -> extract
body text + metadata -> print a fetch report -> POST documents and commit.
"""

import asyncio
import subprocess
from collections import Counter, defaultdict
from pathlib import Path

# NOTE: was `from dateutil import parser` — python-dateutil is NOT in the
# declared script dependencies above, so the script could not start under
# `uv run`. `dateparser` is declared, and its parse() covers the same call.
import dateparser
import httpx
from bs4 import BeautifulSoup

SEARCHCRAFT_URL = "http://0.0.0.0:8000"
INDEX_NAME = "thoughts-links"
# Expand ~ in Python so the path works without relying on shell expansion.
SQLITE_DB = Path("~/.config/thoughts/database2.db").expanduser()

# Module-level tallies shared by all fetch tasks (single event loop, no locking
# needed: tasks only interleave at await points).
response_counter = Counter()
failure_reasons = defaultdict(list)


def get_links() -> list[str]:
    """Return every http(s) link stored in the `post` table.

    Shells out to the sqlite3 CLI. Uses an argument list (shell=False) so
    there are no quoting/injection pitfalls from building a shell string.
    """
    result = subprocess.run(
        ["sqlite3", str(SQLITE_DB), "select link from post"],
        capture_output=True,
        text=True,
    )
    links = []
    for raw in result.stdout.strip().split("\n"):
        line = raw.strip()
        # Strip *before* the prefix test — the original tested the raw line,
        # silently dropping rows with leading whitespace.
        if line.startswith("http"):
            links.append(line)
    return links


def extract_content(html: str) -> str:
    """Return the main readable text of *html*.

    Tries increasingly generic CSS selectors and returns the text of the
    first match that contains anything; falls back to the whole document.
    """
    soup = BeautifulSoup(html, "html.parser")
    selectors = [
        "article",
        "div[itemprop=articleBody]",
        "main",
        "div[class*=post]",
        "div[class*=article]",
        "body",
    ]
    for selector in selectors:
        el = soup.select_one(selector)
        if el and el.get_text(strip=True):
            return el.get_text("\n", strip=True)
    return soup.get_text("\n", strip=True)


def extract_metadata(soup: BeautifulSoup) -> dict:
    """Extract title/description/image/published-date from a parsed page.

    Each field falls back through progressively weaker sources (OpenGraph,
    Twitter cards, plain tags, then document structure). String fields
    default to ""; created_at is a datetime or None.
    """

    def get_meta(properties: list[str]) -> str | None:
        # Check both <meta property=...> and <meta name=...> for each key.
        for prop in properties:
            tag = soup.find("meta", attrs={"property": prop}) or soup.find(
                "meta", attrs={"name": prop}
            )
            if tag and tag.get("content"):
                return tag["content"].strip()
        return None

    title = (
        get_meta(["og:title", "twitter:title", "title"])
        or (soup.title.string.strip() if soup.title else None)
        or (soup.find("h1").get_text(strip=True) if soup.find("h1") else None)
    )
    description = get_meta(
        ["og:description", "description", "twitter:description"]
    ) or (soup.find("p").get_text(strip=True) if soup.find("p") else None)
    image = get_meta(["og:image", "twitter:image", "image"])

    # Hoisted: the original ran this soup.find() twice in one expression.
    time_tag = soup.find("time", {"datetime": True})
    created_at_raw = get_meta(
        [
            "article:published_time",
            "og:published_time",
            "date",
            "published_time",
            "pubdate",
        ]
    ) or (time_tag.get("datetime") if time_tag else None)

    created_at = None
    if created_at_raw:
        # dateparser.parse returns None on unparseable input rather than
        # raising, but keep the guard for pathological strings.
        try:
            created_at = dateparser.parse(created_at_raw)
        except Exception:
            pass

    return {
        "title": title or "",
        "description": description or "",
        "image": image or "",
        "created_at": created_at,
    }


async def fetch_and_parse(client: httpx.AsyncClient, url: str) -> dict | None:
    """Fetch *url* and return a Searchcraft document dict, or None on failure.

    Side effects: tallies every response status in `response_counter` and
    records each failure (with its category) in `failure_reasons`.
    """
    try:
        resp = await client.get(url, timeout=10)
        response_counter[resp.status_code] += 1
        if resp.status_code in {402, 403, 429}:
            reasons = {
                402: "Payment Required",
                403: "Forbidden (Crawler?)",
                429: "Too Many Requests",
            }
            failure_reasons[reasons[resp.status_code]].append(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        body = extract_content(resp.text)
        meta = extract_metadata(soup)
        return {
            "id": str(url),
            "url": str(url),
            "body": str(body),
            "title": str(meta["title"]),
            "description": str(meta["description"]),
            "image": str(meta["image"]),
            # "created_at": meta["created_at"],
        }
    except httpx.TimeoutException:
        failure_reasons["Timeout"].append(url)
    except httpx.ConnectError:
        failure_reasons["Connection Error"].append(url)
    except httpx.RequestError as e:
        failure_reasons["Request Error"].append(f"{url}: {str(e)}")
    except httpx.HTTPStatusError as e:
        # Not a RequestError subclass, so this arm is reachable.
        failure_reasons["HTTP Error"].append(f"{url}: {str(e)}")
    except Exception as e:
        failure_reasons["Unknown"].append(f"{url}: {str(e)}")
    return None


async def upload_documents() -> None:
    """Fetch all stored links concurrently, print a report, and upload the
    successfully parsed documents to Searchcraft (then commit the index)."""
    links = get_links()
    print(f"\U0001f517 Found {len(links)} links")

    async with httpx.AsyncClient(
        headers={"User-Agent": "Mozilla/5.0 (compatible; LinkFetcher/1.0)"}
    ) as client:
        tasks = [fetch_and_parse(client, url) for url in links]
        results = await asyncio.gather(*tasks)
        documents = [doc for doc in results if doc]

        print(f"āœ… Parsed {len(documents)} valid documents")
        print("\nšŸ“Š Response Summary:")
        for code, count in response_counter.items():
            print(f"  {code}: {count}")

        total = sum(response_counter.values())
        failures = total - response_counter[200]
        # Guard the division: total is 0 when no request got any response
        # (empty link table, or every task failed before an HTTP status).
        rate = failures / total if total else 0.0
        print(f"\nāŒ Failure Rate: {failures}/{total} ({rate:.2%})")

        if failure_reasons:
            print("\nšŸ” Failure Reasons:")
            for reason, urls in failure_reasons.items():
                print(f"  {reason}: {len(urls)}")

        if documents:
            response = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/documents", json=documents
            )
            response.raise_for_status()
            print("Uploaded:", response.json())

            commit_resp = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/commit"
            )
            commit_resp.raise_for_status()
            print("Committed:", commit_resp.json())
        else:
            # Was a leftover `breakpoint()` — report and exit cleanly instead.
            print("āš ļø  No documents to upload")


if __name__ == "__main__":
    asyncio.run(upload_documents())