This commit is contained in:
Waylon Walker 2025-07-15 12:30:09 -05:00
commit a153bf1bb4
9 changed files with 1808 additions and 0 deletions

965
.gitignore vendored Normal file
View file

@ -0,0 +1,965 @@
# Created by https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
# Edit at https://www.toptal.com/developers/gitignore?templates=vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
### Data ###
*.csv
*.dat
*.efx
*.gbr
*.key
*.pps
*.ppt
*.pptx
*.sdf
*.tax2010
*.vcf
*.xml
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### Executable ###
*.app
*.bat
*.cgi
*.com
*.exe
*.gadget
*.jar
*.pif
*.vb
*.wsf
### Node ###
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*
# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
*.lcov
# nyc test coverage
.nyc_output
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
jspm_packages/
# Snowpack dependency directory (https://snowpack.dev/)
web_modules/
# TypeScript cache
*.tsbuildinfo
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional stylelint cache
.stylelintcache
# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local
# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache
# Next.js build output
.next
out
# Nuxt.js build / generate output
.nuxt
dist
# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public
# vuepress build output
.vuepress/dist
# vuepress v2.x temp and cache directory
.temp
# Docusaurus cache and generated files
.docusaurus
# Serverless directories
.serverless/
# FuseBox cache
.fusebox/
# DynamoDB Local files
.dynamodb/
# TernJS port file
.tern-port
# Stores VSCode versions used for testing VSCode extensions
.vscode-test
# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
### Node Patch ###
# Serverless Webpack directories
.webpack/
# Optional stylelint cache
# SvelteKit build / generate output
.svelte-kit
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
# /site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### SublimeText ###
# Cache files for Sublime Text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache
# Workspace files are user-specific
*.sublime-workspace
# Project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using Sublime Text
# *.sublime-project
# SFTP configuration file
sftp-config.json
sftp-config-alt*.json
# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
Package Control.merged-ca-bundle
Package Control.user-ca-bundle
oscrypto-ca-bundle.crt
bh_unicode_properties.cache
# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings
### Vim ###
# Swap
[._]*.s[a-v][a-z]
# comment out the next line if you don't need vector files
!*.svg
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
### VisualStudio ###
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.tlog
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio 6 auto-generated project file (contains which files were open etc.)
*.vbp
# Visual Studio 6 workspace and project file (working project files containing files to include in project)
*.dsw
*.dsp
# Visual Studio 6 technical files
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# Visual Studio History (VSHistory) files
.vshistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# VS Code files for those working on multiple tools
*.code-workspace
# Local History for Visual Studio Code
# Windows Installer files from build outputs
*.cab
*.msi
*.msix
*.msm
*.msp
# JetBrains Rider
*.sln.iml
### VisualStudio Patch ###
# Additional files built by Visual Studio
# End of https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
**/.null-ls*

126
create-schema.sh Executable file
View file

@ -0,0 +1,126 @@
#!/usr/bin/env bash
# Create the two Searchcraft index schemas used by this project:
#   - waylonwalker.com  : blog posts (title/body/tags/url, faceted)
#   - thoughts-links    : bookmarked links (title/body/description/url)
#
# Target host defaults to the local dev server; override with:
#   SEARCHCRAFT_URL=http://host:8000 ./create-schema.sh
#
# set -e so a failed first request doesn't silently continue to the second.
set -euo pipefail

SEARCHCRAFT_URL="${SEARCHCRAFT_URL:-http://0.0.0.0:8000}"

# Blog post index: title is weighted heavier than body for relevance.
curl -X POST -H "Content-Type: application/json" --data '
{
  "index": {
    "name": "waylonwalker.com",
    "search_fields": [
      "title",
      "body",
      "tags",
      "url"
    ],
    "fields": {
      "id": {
        "type": "text",
        "required": true,
        "stored": true,
        "indexed": false
      },
      "created_at": {
        "type": "datetime",
        "fast": true,
        "stored": true,
        "indexed": true
      },
      "title": {
        "type": "text",
        "stored": true
      },
      "body": {
        "type": "text",
        "stored": true
      },
      "active": {
        "type": "bool",
        "fast": true,
        "stored": true
      },
      "rating": {
        "type": "f64",
        "stored": true,
        "fast": true
      },
      "reviews": {
        "type": "u64",
        "stored": true,
        "fast": true
      },
      "tags": {
        "type": "text",
        "stored": true,
        "multi": true
      },
      "category": {
        "type": "facet",
        "stored": true
      },
      "formats": {
        "type": "facet",
        "stored": true,
        "multi": true
      },
      "url": {
        "type": "text",
        "stored": true
      }
    },
    "weight_multipliers": {
      "title": 2,
      "body": 0.7
    }
  }
}
' "$SEARCHCRAFT_URL/index"

# Bookmarked links index: same shape minus tags/facets, plus
# description/image fields for link previews.
curl -X POST -H "Content-Type: application/json" --data '
{
  "index": {
    "name": "thoughts-links",
    "search_fields": [
      "title",
      "body",
      "url"
    ],
    "fields": {
      "id": {
        "type": "text",
        "required": true,
        "stored": true,
        "indexed": false
      },
      "created_at": {
        "type": "datetime",
        "fast": true,
        "stored": true,
        "indexed": true
      },
      "title": {
        "type": "text",
        "stored": true
      },
      "body": {
        "type": "text",
        "stored": true
      },
      "description": {
        "type": "text",
        "stored": true
      },
      "image": {
        "type": "text",
        "stored": true
      },
      "url": {
        "type": "text",
        "stored": true
      }
    },
    "weight_multipliers": {
      "title": 2,
      "body": 0.7,
      "description": 0.7
    }
  }
}
' "$SEARCHCRAFT_URL/index"

189
ingest_thoughts.py Executable file
View file

@ -0,0 +1,189 @@
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "httpx",
#     "beautifulsoup4",
#     "python-dateutil",
# ]
# ///
# NOTE: the dependency list previously declared "dateparser", but the code
# imports `dateutil` (distributed as "python-dateutil"), so `uv run` failed
# with ModuleNotFoundError.
import asyncio
import httpx
import subprocess
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from dateutil import parser as date_parser

# Searchcraft backend and target index for crawled link documents.
SEARCHCRAFT_URL = "http://0.0.0.0:8000"
INDEX_NAME = "thoughts-links"
# sqlite database of saved links; the ~ is expanded by the shell in get_links().
SQLITE_DB = "~/.config/thoughts/database2.db"

# Tally of HTTP status codes seen while crawling, and URLs grouped by the
# reason they failed; both are reported by upload_documents().
response_counter = Counter()
failure_reasons = defaultdict(list)
def get_links():
    """Read every stored post link out of the thoughts sqlite database.

    Shells out to the sqlite3 CLI (shell=True so the ~ in SQLITE_DB is
    expanded) and keeps only non-empty lines that look like http(s) URLs.
    """
    command = f'sqlite3 {SQLITE_DB} "select link from post"'
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    links = []
    for raw_line in result.stdout.strip().split("\n"):
        candidate = raw_line.strip()
        if candidate and candidate.startswith("http"):
            links.append(candidate)
    return links
def extract_content(html: str) -> str:
    """Pull the readable article text out of an HTML page.

    Tries progressively less specific containers (article, main, common
    post/article divs, body) and falls back to the text of the whole
    document when none of them yields any text.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidate_selectors = (
        "article",
        "div[itemprop=articleBody]",
        "main",
        "div[class*=post]",
        "div[class*=article]",
        "body",
    )
    for css in candidate_selectors:
        node = soup.select_one(css)
        if node is None:
            continue
        if node.get_text(strip=True):
            return node.get_text("\n", strip=True)
    return soup.get_text("\n", strip=True)
def extract_metadata(soup: BeautifulSoup) -> dict:
    """Extract title/description/image/created_at from a parsed page.

    Each field falls back through a preference-ordered list of meta tags
    (og:*, twitter:*, plain names) and then to visible page elements.
    Missing text fields come back as empty strings; created_at is a parsed
    datetime or None.
    """

    def get_meta(properties: list[str]) -> str | None:
        # Return the first non-empty content attribute, checking
        # <meta property=...> before <meta name=...> for each candidate,
        # honoring the caller's preference order.
        for prop in properties:
            tag = soup.find("meta", attrs={"property": prop}) or soup.find(
                "meta", attrs={"name": prop}
            )
            if tag and tag.get("content"):
                return tag["content"].strip()
        return None

    # Title: meta tags first, then <title>, then the first <h1>.
    title = (
        get_meta(["og:title", "twitter:title", "title"])
        or (soup.title.string.strip() if soup.title else None)
        or (soup.find("h1").get_text(strip=True) if soup.find("h1") else None)
    )
    # Description: meta tags, then the first paragraph of the page.
    description = get_meta(
        ["og:description", "description", "twitter:description"]
    ) or (soup.find("p").get_text(strip=True) if soup.find("p") else None)
    image = get_meta(["og:image", "twitter:image", "image"])
    # Publication date: meta tags, then any <time datetime="..."> element.
    created_at_raw = get_meta(
        [
            "article:published_time",
            "og:published_time",
            "date",
            "published_time",
            "pubdate",
        ]
    ) or (
        soup.find("time", {"datetime": True}).get("datetime")
        if soup.find("time", {"datetime": True})
        else None
    )
    created_at = None
    if created_at_raw:
        try:
            created_at = date_parser.parse(created_at_raw)
        except Exception:
            # Unparseable dates are dropped rather than failing the page.
            pass
    return {
        "title": title or "",
        "description": description or "",
        "image": image or "",
        "created_at": created_at,
    }
async def fetch_and_parse(client, url):
    """Fetch one URL and turn it into a Searchcraft document dict.

    Updates the module-level response_counter / failure_reasons tallies as
    a side effect. Returns None on any failure so callers can filter the
    gathered results.
    """
    try:
        resp = await client.get(url, timeout=10)
        response_counter[resp.status_code] += 1
        # Record "we were blocked"-style statuses with a readable reason
        # before raise_for_status() turns them into exceptions.
        if resp.status_code in {402, 403, 429}:
            reasons = {
                402: "Payment Required",
                403: "Forbidden (Crawler?)",
                429: "Too Many Requests",
            }
            failure_reasons[reasons[resp.status_code]].append(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        body = extract_content(resp.text)
        meta = extract_metadata(soup)
        return {
            "id": str(url),
            "url": str(url),
            "body": str(body),
            "title": str(meta["title"]),
            "description": str(meta["description"]),
            "image": str(meta["image"]),
            # "created_at": meta["created_at"],
        }
    # Most-specific httpx errors first: TimeoutException and ConnectError
    # are subclasses of RequestError, so this ordering is significant.
    except httpx.TimeoutException:
        failure_reasons["Timeout"].append(url)
    except httpx.ConnectError:
        failure_reasons["Connection Error"].append(url)
    except httpx.RequestError as e:
        failure_reasons["Request Error"].append(f"{url}: {str(e)}")
    except httpx.HTTPStatusError as e:
        failure_reasons["HTTP Error"].append(f"{url}: {str(e)}")
    except Exception as e:
        failure_reasons["Unknown"].append(f"{url}: {str(e)}")
    return None
async def upload_documents():
    """Crawl every stored link and push the parsed documents to Searchcraft.

    Fetches all links concurrently, prints a status-code summary and a
    failure breakdown, then uploads and commits only when at least one
    document parsed successfully.
    """
    links = get_links()
    print(f"\U0001f517 Found {len(links)} links")
    async with httpx.AsyncClient(
        headers={"User-Agent": "Mozilla/5.0 (compatible; LinkFetcher/1.0)"}
    ) as client:
        tasks = [fetch_and_parse(client, url) for url in links]
        results = await asyncio.gather(*tasks)
        documents = [doc for doc in results if doc]
        print(f"✅ Parsed {len(documents)} valid documents")
        print("\n📊 Response Summary:")
        for code, count in response_counter.items():
            print(f"  {code}: {count}")
        total = sum(response_counter.values())
        failures = total - response_counter[200]
        # Guard against an empty link list: total == 0 previously raised
        # ZeroDivisionError in the rate calculation.
        if total:
            print(f"\n❌ Failure Rate: {failures}/{total} ({failures / total:.2%})")
        if failure_reasons:
            print("\n🔍 Failure Reasons:")
            for reason, urls in failure_reasons.items():
                print(f"  {reason}: {len(urls)}")
        # Upload if needed
        if documents:
            response = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/documents", json=documents
            )
            response.raise_for_status()
            print("Uploaded:", response.json())
            commit_resp = await client.post(
                f"{SEARCHCRAFT_URL}/index/{INDEX_NAME}/commit"
            )
            commit_resp.raise_for_status()
            print("Committed:", commit_resp.json())
        else:
            # Previously dropped into breakpoint() here, which hangs any
            # unattended run; report and return instead.
            print("⚠️ No documents parsed; skipping upload and commit.")
if __name__ == "__main__":
    # Entry point: run the full crawl-and-upload pipeline once.
    asyncio.run(upload_documents())

7
justfile Normal file
View file

@ -0,0 +1,7 @@
# Serve the built static site locally on port 8080.
serve:
    python -m http.server 8080 -d site

# Create the Searchcraft index schemas (see create-schema.sh).
create-schema:
    ./create-schema.sh

# Run the Searchcraft server in a container on port 8000.
run-searchcraft:
    podman run --name searchcraft -p 8000:8000 docker.io/searchcraftinc/searchcraft-core:latest

166
pages/query.md Normal file
View file

@ -0,0 +1,166 @@
# 🔍 Searchcraft Query Guide
Searchcraft lets you construct powerful search queries with structured JSON. This guide shows how to:
* Combine **exact and fuzzy** queries
* Search logs **within a time range**
* Use **curl**, **Python (httpx)**, and **JavaScript (fetch)** to query
---
## ✅ Mixing Exact and Fuzzy Queries
You can combine multiple types using a `boolean` query with `must`, `should`, or `must_not` clauses.
### JSON Query Example
```json
{
"query": {
"type": "boolean",
"must": [
{ "type": "term", "field": "username", "value": "wyatt" },
{ "type": "fuzzy", "field": "message", "value": "falcon" }
]
}
}
```
---
## 🕓 Searching Logs by Time Range
Use a `range` query on a timestamp field:
```json
{
"query": {
"type": "range",
"field": "timestamp",
"gte": "now-24h",
"lte": "now"
}
}
```
Or combine it:
```json
{
"query": {
"type": "boolean",
"must": [
{ "type": "term", "field": "username", "value": "wyatt" },
{
"type": "range",
"field": "timestamp",
"gte": "now-24h",
"lte": "now"
},
{ "type": "fuzzy", "field": "message", "value": "falcon" }
]
}
}
```
---
## 💻 curl Example
```bash
curl -X POST https://your.searchcraft.domain/index/logs/search \
-H "Content-Type: application/json" \
-d '{
"query": {
"type": "boolean",
"must": [
{ "type": "term", "field": "username", "value": "wyatt" },
{ "type": "fuzzy", "field": "message", "value": "falcon" },
{
"type": "range",
"field": "timestamp",
"gte": "now-24h",
"lte": "now"
}
]
}
}'
```
---
## 🐍 Python (httpx) Example
```python
import httpx
payload = {
"query": {
"type": "boolean",
"must": [
{"type": "term", "field": "username", "value": "wyatt"},
{"type": "fuzzy", "field": "message", "value": "falcon"},
{
"type": "range",
"field": "timestamp",
"gte": "now-24h",
"lte": "now"
}
]
}
}
response = httpx.post(
"https://your.searchcraft.domain/index/logs/search",
json=payload
)
print(response.json())
```
---
## 🌐 JavaScript (fetch) Example
```js
const payload = {
query: {
type: "boolean",
must: [
{ type: "term", field: "username", value: "wyatt" },
{ type: "fuzzy", field: "message", value: "falcon" },
{
type: "range",
field: "timestamp",
gte: "now-24h",
lte: "now"
}
]
}
};
fetch("https://your.searchcraft.domain/index/logs/search", {
method: "POST",
headers: { "Content-Type": "application/json" },
body: JSON.stringify(payload)
})
.then(res => res.json())
.then(console.log)
.catch(console.error);
```
---
## 🧠 Summary Table
| Task | Query Type |
| ------------------------------ | ---------- |
| Exact match | `term` |
| Fuzzy match (typo-tolerant) | `fuzzy` |
| Filter by date or number range | `range` |
| Combine multiple conditions | `boolean` |
| OR logic | `should` |
| Exclude conditions | `must_not` |
---
To adapt these queries to your own index, replace the field names (`username`, `message`, `timestamp`) and the index name (`logs`) with the ones defined in your schema.

30
proxy.py Normal file
View file

@ -0,0 +1,30 @@
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
import httpx

# Backend Searchcraft instance that this proxy forwards search requests to.
SEARCHCRAFT_URL = "http://0.0.0.0:8000"

app = FastAPI()

# Allow frontend to talk to this proxy (can restrict origins if needed)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Change to your domain in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.post("/index/{index}/search")
async def proxy_search(index: str, request: Request):
"""Proxy search request to Searchcraft backend"""
body = await request.body()
async with httpx.AsyncClient() as client:
response = await client.post(
f"{SEARCHCRAFT_URL}/index/{index}/search",
content=body,
headers={"Content-Type": "application/json"},
)
return JSONResponse(content=response.json(), status_code=response.status_code)

103
site/index.html Normal file
View file

@ -0,0 +1,103 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Searchcraft UI</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-900 text-white min-h-screen flex items-center justify-center px-4">
<div class="w-full max-w-2xl text-center">
<h1 class="text-3xl font-bold mb-4">🔍 Searchcraft</h1>
<div class="mb-4 flex flex-col md:flex-row gap-2 items-center justify-center">
<label for="index-select" class="text-gray-300 mr-2">Index:</label>
<select id="index-select" class="bg-gray-800 border border-gray-700 text-white p-2 rounded-md">
<option value="thoughts-links">thoughts-links</option>
<option value="waylonwalker.com">blog-posts</option>
</select>
</div>
<input
id="search-query"
type="text"
placeholder="Start typing to search..."
class="w-full p-4 text-lg rounded-md bg-gray-800 border border-gray-700 focus:outline-none focus:ring-2 focus:ring-cyan-500"
oninput="debouncedSearch()"
/>
<div id="results" class="mt-8 space-y-4 text-left"></div>
</div>
<script>
const BASE_URL = "http://localhost:8081";
let debounceTimer = null;
function getSelectedIndex() {
  // Read the currently chosen index name from the dropdown.
  const dropdown = document.getElementById("index-select");
  return dropdown.value;
}
function debouncedSearch() {
  // Restart the 300 ms timer so only the final keystroke fires search().
  if (debounceTimer !== null) {
    clearTimeout(debounceTimer);
  }
  debounceTimer = setTimeout(search, 300);
}
async function search() {
  // Query the selected index with a fuzzy search and render each hit as a
  // card linking to the source URL, with optional thumbnail and preview.
  const query = document.getElementById("search-query").value;
  const resBox = document.getElementById("results");
  const index = getSelectedIndex();
  resBox.innerHTML = '';
  if (!query) return;
  const response = await fetch(`${BASE_URL}/index/${index}/search`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      limit: 10,
      offset: 0,
      query: { fuzzy: { ctx: query } }
    }),
  });
  const result = await response.json();
  const hits = result?.data?.hits || [];
  for (const hit of hits) {
    const { title, description, body, url, image } = hit.doc;
    // Prefer description over body, defaulting to "" so hits missing both
    // no longer throw TypeError on `.length` of undefined.
    const text = description || body || "";
    const preview = text.slice(0, 1000) + (text.length > 1000 ? "..." : "");
    const link = url || "#";
    const card = document.createElement("a");
    card.href = link;
    card.target = "_blank";
    card.rel = "noopener noreferrer";
    card.className = "flex gap-4 bg-gray-800 border border-gray-700 rounded-lg overflow-hidden hover:border-cyan-500 transition";
    if (image) {
      const img = document.createElement("img");
      img.src = image;
      img.alt = title || "preview";
      img.className = "w-32 h-32 object-cover flex-shrink-0";
      card.appendChild(img);
    }
    const content = document.createElement("div");
    content.className = "p-4 flex flex-col justify-center";
    const titleEl = document.createElement("h3");
    titleEl.className = "text-lg font-semibold text-cyan-400 hover:underline";
    titleEl.textContent = title || "Untitled";
    const descEl = document.createElement("p");
    descEl.className = "text-gray-300 text-sm mt-1";
    descEl.textContent = preview || "(no description)";
    content.appendChild(titleEl);
    content.appendChild(descEl);
    card.appendChild(content);
    resBox.appendChild(card);
  }
}
</script>
</body>
</html>

145
site/v1/index.html Normal file
View file

@ -0,0 +1,145 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Searchcraft UI</title>
<style>
body {
font-family: sans-serif;
background: #1e1e1e;
color: #eee;
padding: 2rem;
}
input, textarea, button {
margin: 0.5rem 0;
width: 100%;
padding: 0.5rem;
font-size: 1rem;
}
textarea { height: 100px; }
.section {
margin-bottom: 2rem;
border-bottom: 1px solid #444;
padding-bottom: 1rem;
}
.results {
display: flex;
flex-wrap: wrap;
gap: 1rem;
}
.card {
background: #2a2a2a;
border: 1px solid #444;
border-radius: 0.5rem;
padding: 1rem;
width: calc(50% - 1rem);
}
.card h3 {
margin: 0 0 0.5rem;
color: #00ffe0;
}
.card p {
margin: 0;
}
</style>
</head>
<body>
<h1>🔍 Searchcraft UI</h1>
<div class="section">
<h2>📄 Upload Document</h2>
<input id="doc-id" placeholder="Document ID (required)" />
<input id="doc-title" placeholder="Title" />
<textarea id="doc-body" placeholder="Body"></textarea>
<button onclick="uploadDocument()">Upload</button>
</div>
<div class="section">
<h2>✅ Commit Changes</h2>
<button onclick="commitChanges()">Commit</button>
</div>
<div class="section">
<h2>🔎 Search</h2>
<input id="search-query" placeholder="Search query..." oninput="debouncedSearch()" />
<div class="results" id="results"></div>
</div>
<script>
const BASE_URL = "http://0.0.0.0:8000";
const INDEX = "thoughts-links";
let debounceTimer = null;
async function uploadDocument() {
  // Send a single document (wrapped in an array, as the API expects)
  // built from the form inputs; the document ID is mandatory.
  const id = document.getElementById("doc-id").value;
  const title = document.getElementById("doc-title").value;
  const body = document.getElementById("doc-body").value;
  if (!id) {
    alert("Document ID is required.");
    return;
  }
  const payload = [{ id, title, body }];
  const response = await fetch(`${BASE_URL}/index/${INDEX}/documents`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  const result = await response.json();
  alert("Upload: " + JSON.stringify(result));
}
async function commitChanges() {
  // Ask Searchcraft to commit pending uploads so they become searchable.
  const commitUrl = `${BASE_URL}/index/${INDEX}/commit`;
  const response = await fetch(commitUrl, { method: "POST" });
  const result = await response.json();
  alert("Commit: " + JSON.stringify(result));
}
function debouncedSearch() {
  // 300 ms debounce: cancel any pending search and schedule a fresh one,
  // so only the last keystroke actually queries the backend.
  if (debounceTimer !== null) {
    clearTimeout(debounceTimer);
  }
  debounceTimer = setTimeout(search, 300);
}
async function search() {
  // Run a fuzzy search against the index and render title/body cards.
  const resBox = document.getElementById("results");
  const query = document.getElementById("search-query").value;
  resBox.innerHTML = '';
  if (!query) return;
  const payload = {
    limit: 10,
    offset: 0,
    query: { fuzzy: { ctx: query } }
  };
  const response = await fetch(`${BASE_URL}/index/${INDEX}/search`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  const result = await response.json();
  const hits = result?.data?.hits || [];
  hits.forEach((hit) => {
    const { title, body } = hit.doc;
    const card = document.createElement("div");
    card.className = "card";
    const heading = document.createElement("h3");
    heading.textContent = title || "Untitled";
    const paragraph = document.createElement("p");
    paragraph.textContent = body || "(no body)";
    card.appendChild(heading);
    card.appendChild(paragraph);
    resBox.appendChild(card);
  });
}
</script>
</body>
</html>

77
site/v2/index.html Normal file
View file

@ -0,0 +1,77 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Searchcraft UI</title>
<script src="https://cdn.tailwindcss.com"></script>
</head>
<body class="bg-gray-900 text-white min-h-screen flex items-center justify-center px-4">
<div class="w-full max-w-2xl text-center">
<h1 class="text-3xl font-bold mb-6">🔍 Searchcraft</h1>
<input
id="search-query"
type="text"
placeholder="Start typing to search..."
class="w-full p-4 text-lg rounded-md bg-gray-800 border border-gray-700 focus:outline-none focus:ring-2 focus:ring-cyan-500"
oninput="debouncedSearch()"
/>
<div id="results" class="mt-8 space-y-4 text-left"></div>
</div>
<script>
const BASE_URL = "http://0.0.0.0:8000";
const INDEX = "thoughts-links";
let debounceTimer = null;
function debouncedSearch() {
  // Coalesce rapid keystrokes: only the last one within 300 ms triggers
  // an actual search() call.
  if (debounceTimer !== null) {
    clearTimeout(debounceTimer);
  }
  debounceTimer = setTimeout(search, 300);
}
async function search() {
  // Fuzzy-search the index and render each hit as a clickable card with
  // a title and a body preview truncated to 1000 characters.
  const resBox = document.getElementById("results");
  const query = document.getElementById("search-query").value;
  resBox.innerHTML = '';
  if (!query) return;
  const payload = {
    limit: 10,
    offset: 0,
    query: { fuzzy: { ctx: query } }
  };
  const response = await fetch(`${BASE_URL}/index/${INDEX}/search`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  const result = await response.json();
  const hits = result?.data?.hits || [];
  for (const hit of hits) {
    const { title, body, url } = hit.doc;
    const text = body || "(no body)";
    const truncated = body && body.length > 1000;
    const preview = text.slice(0, 1000) + (truncated ? "..." : "");
    const card = document.createElement("a");
    card.href = url || "#";
    card.target = "_blank";
    card.rel = "noopener noreferrer";
    card.className = "block p-4 bg-gray-800 border border-gray-700 rounded-md hover:border-cyan-500 transition";
    const heading = document.createElement("h3");
    heading.className = "text-xl font-semibold text-cyan-400 mb-2";
    heading.textContent = title || "Untitled";
    const paragraph = document.createElement("p");
    paragraph.className = "text-gray-300";
    paragraph.textContent = preview;
    card.appendChild(heading);
    card.appendChild(paragraph);
    resBox.appendChild(card);
  }
}
</script>
</body>
</html>