init
This commit is contained in:
commit
a153bf1bb4
9 changed files with 1808 additions and 0 deletions
189
ingest_thoughts.py
Executable file
189
ingest_thoughts.py
Executable file
|
|
@ -0,0 +1,189 @@
|
|||
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "httpx",
# "beautifulsoup4",
# "dateparser",
# ]
# ///
# NOTE(review): the code imports `dateutil`, but the declared dependency is
# "dateparser". This works only because dateparser pulls in python-dateutil
# transitively — consider declaring "python-dateutil" explicitly.
import asyncio
import httpx
import subprocess
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from dateutil import parser as date_parser

# Base URL of the local Searchcraft instance documents are uploaded to.
SEARCHCRAFT_URL = "http://0.0.0.0:8000"
# Name of the Searchcraft index that receives the documents.
INDEX_NAME = "thoughts-links"
# SQLite database containing a `post` table with the links to ingest.
SQLITE_DB = "~/.config/thoughts/database2.db"

# Module-level tallies filled in while fetching, reported at end of run.
response_counter = Counter()  # HTTP status code -> occurrence count
failure_reasons = defaultdict(list)  # failure reason -> list of URLs
def get_links(db_path: str = "~/.config/thoughts/database2.db") -> list[str]:
    """Return all http(s) links stored in the `post` table of the SQLite DB.

    Args:
        db_path: Path to the SQLite database file; a leading `~` is expanded.

    Returns:
        List of non-empty link strings that start with "http".
    """
    import os
    import sqlite3

    # Use the stdlib sqlite3 driver instead of shelling out to the sqlite3
    # CLI with shell=True — the original depended on the binary being
    # installed and on fragile stdout line parsing.
    conn = sqlite3.connect(os.path.expanduser(db_path))
    try:
        rows = conn.execute("select link from post").fetchall()
    finally:
        conn.close()
    return [
        link.strip()
        for (link,) in rows
        if link and link.strip() and link.startswith("http")
    ]
|
||||
|
||||
|
||||
def extract_content(html: str) -> str:
    """Extract the main readable text from an HTML document.

    Tries a sequence of increasingly generic CSS selectors and returns the
    text of the first non-empty match, falling back to the whole document's
    text when nothing matches.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidates = (
        "article",
        "div[itemprop=articleBody]",
        "main",
        "div[class*=post]",
        "div[class*=article]",
        "body",
    )
    for css in candidates:
        node = soup.select_one(css)
        if node is None:
            continue
        if node.get_text(strip=True):
            return node.get_text("\n", strip=True)
    return soup.get_text("\n", strip=True)
|
||||
|
||||
|
||||
def extract_metadata(soup: BeautifulSoup) -> dict:
    """Pull title/description/image/published-date metadata from parsed HTML.

    Args:
        soup: The parsed document.

    Returns:
        dict with keys "title", "description", "image" (all str, possibly
        empty) and "created_at" (a parsed datetime, or None).
    """

    def get_meta(properties: list[str]) -> str | None:
        # Meta tags may use either property= (OpenGraph) or name= attributes;
        # check both, in the caller's priority order.
        for prop in properties:
            tag = soup.find("meta", attrs={"property": prop}) or soup.find(
                "meta", attrs={"name": prop}
            )
            if tag and tag.get("content"):
                return tag["content"].strip()
        return None

    # Bug fix: soup.title.string is None when <title> is empty or contains
    # nested markup, so the original .strip() raised AttributeError.
    # get_text() is safe in both cases and returns "" (falsy), so the <h1>
    # fallback still applies.
    h1 = soup.find("h1")
    title = (
        get_meta(["og:title", "twitter:title", "title"])
        or (soup.title.get_text(strip=True) if soup.title else None)
        or (h1.get_text(strip=True) if h1 else None)
    )

    first_p = soup.find("p")
    description = get_meta(
        ["og:description", "description", "twitter:description"]
    ) or (first_p.get_text(strip=True) if first_p else None)

    image = get_meta(["og:image", "twitter:image", "image"])

    # Hoisted the duplicated soup.find("time", ...) call.
    time_tag = soup.find("time", {"datetime": True})
    created_at_raw = get_meta(
        [
            "article:published_time",
            "og:published_time",
            "date",
            "published_time",
            "pubdate",
        ]
    ) or (time_tag.get("datetime") if time_tag else None)

    created_at = None
    if created_at_raw:
        try:
            created_at = date_parser.parse(created_at_raw)
        except Exception:
            # Unparseable dates are not fatal; leave created_at as None.
            pass

    return {
        "title": title or "",
        "description": description or "",
        "image": image or "",
        "created_at": created_at,
    }
|
||||
|
||||
|
||||
async def fetch_and_parse(client, url):
    """Fetch one URL and turn it into a Searchcraft document dict.

    Updates the module-level `response_counter` and `failure_reasons`
    tallies as a side effect.

    Args:
        client: An httpx.AsyncClient used to issue the GET request.
        url: The URL to fetch.

    Returns:
        A document dict on success, or None on any failure.
    """
    try:
        resp = await client.get(url, timeout=10)
        response_counter[resp.status_code] += 1

        # Record well-known "blocked" statuses under a specific reason and
        # return early. (Bug fix: previously these fell through to
        # raise_for_status() and were counted a second time as "HTTP Error".)
        reasons = {
            402: "Payment Required",
            403: "Forbidden (Crawler?)",
            429: "Too Many Requests",
        }
        if resp.status_code in reasons:
            failure_reasons[reasons[resp.status_code]].append(url)
            return None

        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        body = extract_content(resp.text)
        meta = extract_metadata(soup)

        return {
            "id": str(url),
            "url": str(url),
            "body": str(body),
            "title": str(meta["title"]),
            "description": str(meta["description"]),
            "image": str(meta["image"]),
            # created_at is intentionally not uploaded.
            # "created_at": meta["created_at"],
        }

    # Narrow httpx exceptions first; the catch-all keeps one bad URL from
    # aborting the whole asyncio.gather() batch.
    except httpx.TimeoutException:
        failure_reasons["Timeout"].append(url)
    except httpx.ConnectError:
        failure_reasons["Connection Error"].append(url)
    except httpx.RequestError as e:
        failure_reasons["Request Error"].append(f"{url}: {str(e)}")
    except httpx.HTTPStatusError as e:
        failure_reasons["HTTP Error"].append(f"{url}: {str(e)}")
    except Exception as e:
        failure_reasons["Unknown"].append(f"{url}: {str(e)}")

    return None
|
||||
|
||||
|
||||
async def upload_documents():
    """Fetch all stored links, report stats, and upload parsed documents.

    Pulls links from the local SQLite DB, fetches them concurrently,
    prints response/failure summaries, then posts the parsed documents to
    the Searchcraft index and commits the index.
    """
    links = get_links()
    print(f"\U0001f517 Found {len(links)} links")

    async with httpx.AsyncClient(
        headers={"User-Agent": "Mozilla/5.0 (compatible; LinkFetcher/1.0)"}
    ) as client:
        tasks = [fetch_and_parse(client, url) for url in links]
        results = await asyncio.gather(*tasks)

        documents = [doc for doc in results if doc]
        print(f"✅ Parsed {len(documents)} valid documents")

        print("\n📊 Response Summary:")
        for code, count in response_counter.items():
            print(f" {code}: {count}")

        total = sum(response_counter.values())
        failures = total - response_counter[200]
        # Bug fix: guard the failure-rate division — with zero links (or
        # nothing but transport errors) total is 0 and this crashed with
        # ZeroDivisionError.
        if total:
            print(f"\n❌ Failure Rate: {failures}/{total} ({failures / total:.2%})")
        else:
            print("\n❌ No HTTP responses received")

        if failure_reasons:
            print("\n🔍 Failure Reasons:")
            for reason, urls in failure_reasons.items():
                print(f" {reason}: {len(urls)}")

        # Upload if needed
        if documents:
            response = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/documents", json=documents
            )
            response.raise_for_status()
            print("Uploaded:", response.json())
            commit_resp = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/commit"
            )
            commit_resp.raise_for_status()
            print("Committed:", commit_resp.json())
        else:
            # Bug fix: a stray breakpoint() was left here, which drops an
            # unattended run into pdb. Report and fall through instead.
            print("⚠️ No documents to upload")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the full fetch-and-ingest pipeline.
    asyncio.run(upload_documents())
|
||||
Loading…
Add table
Add a link
Reference in a new issue