From 060f998c598dd0a11744b39264a54fef2ae24c7e Mon Sep 17 00:00:00 2001 From: "Waylon S. Walker" Date: Wed, 3 Sep 2025 20:22:39 -0500 Subject: [PATCH] Initial Commit for rss-link-app Analyze links from rss feeds --- .gitignore | 966 ++++++++++++++++++++++++++++ README.md | 50 ++ main.py | 537 ++++++++++++++++ requirements.txt | 10 + static/styles.css | 84 +++ templates/components/host_card.html | 62 ++ templates/index.html | 104 +++ templates/layout.html | 24 + 8 files changed, 1837 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 static/styles.css create mode 100644 templates/components/host_card.html create mode 100644 templates/index.html create mode 100644 templates/layout.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e54723f --- /dev/null +++ b/.gitignore @@ -0,0 +1,966 @@ +# Created by https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode +# Edit at https://www.toptal.com/developers/gitignore?templates=vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode + +### Data ### +*.csv +*.dat +*.efx +*.gbr +*.key +*.pps +*.ppt +*.pptx +*.sdf +*.tax2010 +*.vcf +*.xml + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + + +### Executable ### +*.app +*.bat +*.cgi +*.com +*.exe +*.gadget +*.jar +*.pif +*.vb +*.wsf 
+ +### Node ### +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* +.pnpm-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# Snowpack dependency directory (https://snowpack.dev/) +web_modules/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional stylelint cache +.stylelintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variable files +.env +.env.development.local +.env.test.local +.env.production.local +.env.local + +# parcel-bundler cache (https://parceljs.org/) +.cache +.parcel-cache + +# Next.js build output +.next +out + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# vuepress v2.x temp and cache directory +.temp + +# Docusaurus cache and generated files +.docusaurus + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file 
+.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + +# yarn v2 +.yarn/cache +.yarn/unplugged +.yarn/build-state.yml +.yarn/install-state.gz +.pnp.* + +### Node Patch ### +# Serverless Webpack directories +.webpack/ + +# Optional stylelint cache + +# SvelteKit build / generate output +.svelte-kit + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ 
+.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### SublimeText ### +# Cache files for Sublime Text +*.tmlanguage.cache +*.tmPreferences.cache +*.stTheme.cache + +# Workspace files are user-specific +*.sublime-workspace + +# Project files should be checked into the repository, unless a significant +# proportion of contributors will probably not be using Sublime Text +# *.sublime-project + +# SFTP configuration file +sftp-config.json +sftp-config-alt*.json + +# Package control specific files +Package Control.last-run +Package Control.ca-list +Package Control.ca-bundle +Package Control.system-ca-bundle +Package Control.cache/ +Package Control.ca-certs/ +Package Control.merged-ca-bundle +Package Control.user-ca-bundle +oscrypto-ca-bundle.crt +bh_unicode_properties.cache + +# Sublime-github package stores a github token in this file +# https://packagecontrol.io/packages/sublime-github +GitHub.sublime-settings + 
+### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### VisualStudio ### +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. +## +## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore + +# User-specific files +*.rsuser +*.suo +*.user +*.userosscache +*.sln.docstates + +# User-specific files (MonoDevelop/Xamarin Studio) +*.userprefs + +# Mono auto generated files +mono_crash.* + +# Build results +[Dd]ebug/ +[Dd]ebugPublic/ +[Rr]elease/ +[Rr]eleases/ +x64/ +x86/ +[Ww][Ii][Nn]32/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# Visual Studio 2015/2017 cache/options directory +.vs/ +# Uncomment if you have tasks that create the project's static files in wwwroot +#wwwroot/ + +# Visual Studio 2017 auto generated files +Generated\ Files/ + +# MSTest test Results +[Tt]est[Rr]esult*/ +[Bb]uild[Ll]og.* + +# NUnit +*.VisualState.xml +TestResult.xml +nunit-*.xml + +# Build Results of an ATL Project +[Dd]ebugPS/ +[Rr]eleasePS/ +dlldata.c + +# Benchmark Results +BenchmarkDotNet.Artifacts/ + +# .NET Core +project.lock.json +project.fragment.lock.json +artifacts/ + +# ASP.NET Scaffolding +ScaffoldingReadMe.txt + +# StyleCop +StyleCopReport.xml + +# Files built by Visual Studio +*_i.c +*_p.c +*_h.h +*.ilk +*.meta +*.obj +*.iobj +*.pch +*.pdb +*.ipdb +*.pgc +*.pgd +*.rsp +*.sbr 
+*.tlb +*.tli +*.tlh +*.tmp +*.tmp_proj +*_wpftmp.csproj +*.tlog +*.vspscc +*.vssscc +.builds +*.pidb +*.svclog +*.scc + +# Chutzpah Test files +_Chutzpah* + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opendb +*.opensdf +*.cachefile +*.VC.db +*.VC.VC.opendb + +# Visual Studio profiler +*.psess +*.vsp +*.vspx +*.sap + +# Visual Studio Trace Files +*.e2e + +# TFS 2012 Local Workspace +$tf/ + +# Guidance Automation Toolkit +*.gpState + +# ReSharper is a .NET coding add-in +_ReSharper*/ +*.[Rr]e[Ss]harper +*.DotSettings.user + +# TeamCity is a build add-in +_TeamCity* + +# DotCover is a Code Coverage Tool +*.dotCover + +# AxoCover is a Code Coverage Tool +.axoCover/* +!.axoCover/settings.json + +# Coverlet is a free, cross platform Code Coverage Tool +coverage*.json +coverage*.xml +coverage*.info + +# Visual Studio code coverage results +*.coverage +*.coveragexml + +# NCrunch +_NCrunch_* +.*crunch*.local.xml +nCrunchTemp_* + +# MightyMoose +*.mm.* +AutoTest.Net/ + +# Web workbench (sass) +.sass-cache/ + +# Installshield output folder +[Ee]xpress/ + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish/ + +# Publish Web Output +*.[Pp]ublish.xml +*.azurePubxml +# Note: Comment the next line if you want to checkin your web deploy settings, +# but database connection strings (with potential passwords) will be unencrypted +*.pubxml +*.publishproj + +# Microsoft Azure Web App publish settings. 
Comment the next line if you want to +# checkin your Azure Web App publish settings, but sensitive information contained +# in these scripts will be unencrypted +PublishScripts/ + +# NuGet Packages +*.nupkg +# NuGet Symbol Packages +*.snupkg +# The packages folder can be ignored because of Package Restore +**/[Pp]ackages/* +# except build/, which is used as an MSBuild target. +!**/[Pp]ackages/build/ +# Uncomment if necessary however generally it will be regenerated when needed +#!**/[Pp]ackages/repositories.config +# NuGet v3's project.json files produces more ignorable files +*.nuget.props +*.nuget.targets + +# Microsoft Azure Build Output +csx/ +*.build.csdef + +# Microsoft Azure Emulator +ecf/ +rcf/ + +# Windows Store app package directories and files +AppPackages/ +BundleArtifacts/ +Package.StoreAssociation.xml +_pkginfo.txt +*.appx +*.appxbundle +*.appxupload + +# Visual Studio cache files +# files ending in .cache can be ignored +*.[Cc]ache +# but keep track of directories ending in .cache +!?*.[Cc]ache/ + +# Others +ClientBin/ +~$* +*.dbmdl +*.dbproj.schemaview +*.jfm +*.pfx +*.publishsettings +orleans.codegen.cs + +# Including strong name files can present a security risk +# (https://github.com/github/gitignore/pull/2483#issue-259490424) +#*.snk + +# Since there are multiple workflows, uncomment next line to ignore bower_components +# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) +#bower_components/ + +# RIA/Silverlight projects +Generated_Code/ + +# Backup & report files from converting an old project file +# to a newer Visual Studio version. 
Backup files are not needed, +# because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML +UpgradeLog*.htm +ServiceFabricBackup/ +*.rptproj.bak + +# SQL Server files +*.mdf +*.ldf +*.ndf + +# Business Intelligence projects +*.rdl.data +*.bim.layout +*.bim_*.settings +*.rptproj.rsuser +*- [Bb]ackup.rdl +*- [Bb]ackup ([0-9]).rdl +*- [Bb]ackup ([0-9][0-9]).rdl + +# Microsoft Fakes +FakesAssemblies/ + +# GhostDoc plugin setting file +*.GhostDoc.xml + +# Node.js Tools for Visual Studio +.ntvs_analysis.dat + +# Visual Studio 6 build log +*.plg + +# Visual Studio 6 workspace options file +*.opt + +# Visual Studio 6 auto-generated workspace file (contains which files were open etc.) +*.vbw + +# Visual Studio 6 auto-generated project file (contains which files were open etc.) +*.vbp + +# Visual Studio 6 workspace and project file (working project files containing files to include in project) +*.dsw +*.dsp + +# Visual Studio 6 technical files + +# Visual Studio LightSwitch build output +**/*.HTMLClient/GeneratedArtifacts +**/*.DesktopClient/GeneratedArtifacts +**/*.DesktopClient/ModelManifest.xml +**/*.Server/GeneratedArtifacts +**/*.Server/ModelManifest.xml +_Pvt_Extensions + +# Paket dependency manager +.paket/paket.exe +paket-files/ + +# FAKE - F# Make +.fake/ + +# CodeRush personal settings +.cr/personal + +# Python Tools for Visual Studio (PTVS) +*.pyc + +# Cake - Uncomment if you are using it +# tools/** +# !tools/packages.config + +# Tabs Studio +*.tss + +# Telerik's JustMock configuration file +*.jmconfig + +# BizTalk build output +*.btp.cs +*.btm.cs +*.odx.cs +*.xsd.cs + +# OpenCover UI analysis results +OpenCover/ + +# Azure Stream Analytics local run output +ASALocalRun/ + +# MSBuild Binary and Structured Log +*.binlog + +# NVidia Nsight GPU debugger configuration file +*.nvuser + +# MFractors (Xamarin productivity tool) working folder +.mfractor/ + +# Local History for Visual Studio +.localhistory/ + +# Visual Studio History (VSHistory) files 
+.vshistory/ + +# BeatPulse healthcheck temp database +healthchecksdb + +# Backup folder for Package Reference Convert tool in Visual Studio 2017 +MigrationBackup/ + +# Ionide (cross platform F# VS Code tools) working folder +.ionide/ + +# Fody - auto-generated XML schema +FodyWeavers.xsd + +# VS Code files for those working on multiple tools +*.code-workspace + +# Local History for Visual Studio Code + +# Windows Installer files from build outputs +*.cab +*.msi +*.msix +*.msm +*.msp + +# JetBrains Rider +*.sln.iml + +### VisualStudio Patch ### +# Additional files built by Visual Studio + +# End of https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode + +*.db diff --git a/README.md b/README.md new file mode 100644 index 0000000..5e45be8 --- /dev/null +++ b/README.md @@ -0,0 +1,50 @@ +# RSS Link Audit (FastAPI) + +A FastAPI app that accepts an RSS/Atom feed URL, fetches each post’s full HTML, extracts outbound links, groups them by hostname, **hunts for each host’s RSS feed** (common endpoints + homepage discovery), and renders a stylish report using the **Royal Armory** palette. + +## Features + +- Input a feed URL via UI or JSON. +- Concurrent fetching (httpx + asyncio). +- Extract links from each post page. +- Group by hostname; count occurrences. +- Heuristic RSS discovery: + - Probe common feed endpoints (e.g. `/feed`, `/rss.xml`, `/atom.xml`, etc.). + - Parse homepage `<link rel="alternate">` for RSS/Atom. + - Scan homepage `<a>` tags for `rss|atom|feed`. + - Validate candidates with `feedparser`. +- Report UI: + - Per-host card with counts. + - **Bar** visual for how many links a host has. + - **Top links** (if mentioned > 1). + - Links list truncated with a **More** button. + - RSS/Atom badge if found. 
+ +## Run locally + +```bash +python -m venv .venv +source .venv/bin/activate # Windows: .venv\Scripts\activate +pip install -r requirements.txt +uvicorn main:app --reload +``` + +Open: http://127.0.0.1:8000 + +## API + +``` +POST /api/analyze +Content-Type: application/json + +{"feed_url": "https://example.com/feed.xml"} +``` + +Returns JSON with the summarized data. + +## Notes / Caveats + +- Only static HTML is parsed (no JS rendering). +- Some sites block bots; results may vary. +- For large feeds, you may wish to trim the number of posts (e.g., slice `post_urls` in `analyze_feed`). +- Consider adding caching (e.g., `aiocache`, Redis) if you’ll run this frequently. diff --git a/main.py b/main.py new file mode 100644 index 0000000..7cca3a7 --- /dev/null +++ b/main.py @@ -0,0 +1,537 @@ +# main.py (v1.2) — robust feed parsing, clearer SSE progress, normalized host caching, concurrent discovery +import asyncio +import json +import uuid +from collections import Counter +from dataclasses import dataclass, field +from typing import Dict, List, Optional, Set, Tuple +from urllib.parse import urljoin, urlparse +from contextlib import asynccontextmanager + +import httpx +import feedparser +from bs4 import BeautifulSoup + +from fastapi import FastAPI, Request, Form, HTTPException +from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates + +from sqlmodel import Field, SQLModel, create_engine, Session, select +from datetime import datetime, timezone + + +# ------------------------------ +# Settings / Constants +# ------------------------------ +REQUEST_TIMEOUT = httpx.Timeout(15.0, connect=8.0, read=15.0) +HEADERS = { + "User-Agent": "LinkAuditBot/1.2 (+https://example.com; contact: admin@example.com)" +} + +COMMON_FEED_PATHS = [ + "/feed", "/feed/", "/feed.xml", + "/rss", "/rss.xml", "/rss/", + "/atom", "/atom.xml", + "/index.xml", + "/blog/feed", "/blog/rss", 
"/blog/rss.xml", "/blog/index.xml", + "/feeds/posts/default?alt=rss", # Blogger + "/news/atom.xml", "/news/rss.xml", + "/.rss", "/?feed=rss2", # WP variants + "/category/news/feed", "/?feed=atom", +] + +FEED_MIME_HINTS = { + "application/rss+xml", + "application/atom+xml", + "application/xml", + "text/xml", +} + +DISCOVERY_CONCURRENCY = 10 + + +# ------------------------------ +# Database Models (SQLModel) +# ------------------------------ +class PageCache(SQLModel, table=True): + url: str = Field(primary_key=True) + html: Optional[str] = None + fetched_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + +class LinksCache(SQLModel, table=True): + url: str = Field(primary_key=True) + links_json: str # JSON list[str] + extracted_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + +class HostFeedCache(SQLModel, table=True): + hostname: str = Field(primary_key=True) # normalized! + feed_url: Optional[str] = None + checked_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + +class FeedRun(SQLModel, table=True): + id: str = Field(primary_key=True, default_factory=lambda: str(uuid.uuid4())) + feed_url: str + started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + finished_at: Optional[datetime] = None + summary_json: Optional[str] = None # store last summary, if desired + + +engine = create_engine("sqlite:///cache.db", echo=False) +SQLModel.metadata.create_all(engine) + + +# ------------------------------ +# Data models +# ------------------------------ +@dataclass +class HostSummary: + hostname: str + count: int = 0 + unique_links: Set[str] = field(default_factory=set) + link_counts: Counter = field(default_factory=Counter) + feed_url: Optional[str] = None + + +# ------------------------------ +# Utilities +# ------------------------------ +def now_utc() -> datetime: + return datetime.now(timezone.utc) + +def normalize_host(host: str) -> str: + if not host: + return host + h = 
host.strip().lower().rstrip(".") + if h.startswith("www."): + h = h[4:] + return h + +def is_http_url(href: str) -> bool: + try: + p = urlparse(href) + return p.scheme in ("http", "https") + except Exception: + return False + +def absolutize(href: str, base_url: str) -> Optional[str]: + if not href: + return None + if href.startswith("#") or href.startswith("mailto:") or href.startswith("tel:"): + return None + try: + abs_url = urljoin(base_url, href) + if is_http_url(abs_url): + return abs_url + except Exception: + return None + return None + +def extract_links_from_html(html: str, base_url: str) -> List[str]: + soup = BeautifulSoup(html, "lxml") + links: List[str] = [] + for a in soup.find_all("a", href=True): + u = absolutize(a.get("href"), base_url) + if u: + links.append(u) + return links + + +# ------------------------------ +# Networking +# ------------------------------ +async def fetch_text(client: httpx.AsyncClient, url: str) -> Optional[str]: + try: + r = await client.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True) + if r.status_code < 400: + # Do NOT force UTF-8; respect server if present + if r.encoding is None: + r.encoding = r.apparent_encoding or "utf-8" + return r.text + except Exception: + return None + return None + +async def fetch_bytes(client: httpx.AsyncClient, url: str) -> Optional[Tuple[bytes, Optional[str]]]: + try: + r = await client.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True) + if r.status_code < 400: + ctype = r.headers.get("content-type") + return r.content, ctype + except Exception: + return None + return None + +async def fetch_head_ok(client: httpx.AsyncClient, url: str) -> Tuple[bool, Optional[str]]: + try: + r = await client.head(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True) + if r.status_code < 400: + return True, r.headers.get("content-type") + except Exception: + pass + try: + r = await client.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, 
follow_redirects=True) + if r.status_code < 400: + return True, r.headers.get("content-type") + except Exception: + pass + return False, None + + +# ------------------------------ +# Cache helpers +# ------------------------------ +def cache_get_page(url: str) -> Optional[PageCache]: + with Session(engine) as sess: + return sess.get(PageCache, url) + +def cache_set_page(url: str, html: Optional[str]): + with Session(engine) as sess: + sess.merge(PageCache(url=url, html=html, fetched_at=now_utc())) + sess.commit() + +def cache_get_links(url: str) -> Optional[List[str]]: + with Session(engine) as sess: + row = sess.get(LinksCache, url) + if not row: + return None + try: + return json.loads(row.links_json) + except Exception: + return None + +def cache_set_links(url: str, links: List[str]): + with Session(engine) as sess: + sess.merge(LinksCache(url=url, links_json=json.dumps(links), extracted_at=now_utc())) + sess.commit() + +def cache_get_host_feed(hostname: str) -> Optional[str]: + host_key = normalize_host(hostname) + with Session(engine) as sess: + row = sess.get(HostFeedCache, host_key) + return row.feed_url if row else None + +def cache_set_host_feed(hostname: str, feed_url: Optional[str]): + host_key = normalize_host(hostname) + with Session(engine) as sess: + sess.merge(HostFeedCache(hostname=host_key, feed_url=feed_url, checked_at=now_utc())) + sess.commit() + + +# ------------------------------ +# Cached fetch/extract +# ------------------------------ +async def fetch_page_html(client: httpx.AsyncClient, url: str) -> Optional[str]: + cached = cache_get_page(url) + if cached and cached.html: + return cached.html + html = await fetch_text(client, url) + cache_set_page(url, html) + return html + +async def get_links_for_page(client: httpx.AsyncClient, url: str) -> List[str]: + cached = cache_get_links(url) + if cached is not None: + return cached + html = await fetch_page_html(client, url) + if not html: + cache_set_links(url, []) + return [] + links = 
extract_links_from_html(html, url)
+    cache_set_links(url, links)
+    return links
+
+
+# ------------------------------
+# Robust feed parsing
+# ------------------------------
+def _parses_as_feed(parsed) -> bool:
+    """Return True when feedparser flagged no error (bozo == 0) and produced feed metadata or entries."""
+    return parsed.bozo == 0 and bool(parsed.feed or parsed.entries)
+
+
+async def fetch_feed_entries(client: httpx.AsyncClient, feed_url: str) -> List[str]:
+    """
+    Download a feed and return the de-duplicated URLs of its entries.
+
+    The raw bytes are handed to feedparser so it can work out the encoding.
+    When the first parse is flagged as broken, two clean-ups are retried:
+    dropping a leading UTF-8 BOM, and rewriting a `us-ascii` XML encoding
+    declaration to `utf-8`.
+
+    Raises:
+        ValueError: if the feed cannot be downloaded or no attempt parses cleanly.
+    """
+    got = await fetch_bytes(client, feed_url)
+    if not got:
+        raise ValueError("Could not download the feed.")
+    content, _ctype = got  # the content type is not needed for parsing
+
+    parsed = feedparser.parse(content)
+    if _parses_as_feed(parsed):
+        return _entries_to_urls(parsed)
+
+    # Fallback 1: strip BOM. Slice off the exact 3-byte prefix: bytes.lstrip()
+    # treats its argument as a *set* of byte values and could also eat
+    # legitimate leading bytes.
+    bom = b"\xef\xbb\xbf"
+    cleaned = content
+    if content.startswith(bom):
+        cleaned = content[len(bom):]
+        parsed2 = feedparser.parse(cleaned)
+        if _parses_as_feed(parsed2):
+            return _entries_to_urls(parsed2)
+
+    # Fallback 2: replace us-ascii decl with utf-8
+    cleaned2 = cleaned.replace(b'encoding="us-ascii"', b'encoding="utf-8"')
+    if cleaned2 != cleaned:  # identical bytes were already parsed above
+        try:
+            parsed3 = feedparser.parse(cleaned2)
+            if _parses_as_feed(parsed3):
+                return _entries_to_urls(parsed3)
+        except Exception:
+            # Deliberately best-effort: fall through to the error below, which
+            # reports the problem found by the first parse.
+            pass
+    raise ValueError(f"Could not parse feed: {getattr(parsed, 'bozo_exception', 'unknown parse error')}")
+
+def _entries_to_urls(parsed) -> List[str]:
+    """
+    Collect one URL per feed entry, in first-seen order and without duplicates.
+
+    An entry contributes its `link`; failing that, its `id` when
+    `is_http_url` accepts it. Entries with neither are skipped.
+    """
+    urls: List[str] = []
+    for e in parsed.entries:
+        if getattr(e, "link", None):
+            urls.append(e.link)
+        elif getattr(e, "id", None) and is_http_url(e.id):
+            urls.append(e.id)
+    # dicts keep insertion order, so this de-duplicates without reordering
+    return list(dict.fromkeys(urls))
+
+
+# ------------------------------
+# Feed discovery (normalized + concurrent)
+# ------------------------------
+async def _url_serves_feed(client: httpx.AsyncClient, url: str) -> bool:
+    """
+    Download `url` through the shared client and report whether it parses as a feed.
+
+    Passing a URL straight to feedparser would make it do its own blocking
+    download inside the event loop, outside the shared client.
+    """
+    try:
+        got = await fetch_bytes(client, url)
+    except httpx.HTTPError:
+        return False
+    if not got:
+        return False
+    content, _ctype = got
+    return _parses_as_feed(feedparser.parse(content))
+
+
+async def discover_feed_for_host(client: httpx.AsyncClient, hostname: str) -> Optional[str]:
+    """
+    Find a feed URL for `hostname`, or return None.
+
+    Order of attempts:
+      1. the host-feed cache (any non-None cached value is returned as-is);
+      2. every COMMON_FEED_PATHS entry on the http/https and www/non-www
+         variants of the host, probed concurrently - the first hit wins;
+      3. each variant's home page: <link rel="alternate"> tags whose type
+         mentions rss/atom/xml, then <a> tags whose href mentions rss/atom/feed.
+    The outcome, including None, is written back to the cache.
+    """
+    host_key = normalize_host(hostname)
+    cached = cache_get_host_feed(host_key)
+    if cached is not None:
+        return cached
+
+    canon = host_key
+    bases = [f"https://{canon}", f"http://{canon}"]
+    if not canon.startswith("www."):
+        bases.append(f"https://www.{canon}")
+        bases.append(f"http://www.{canon}")
+
+    async def try_candidate(url: str) -> Optional[str]:
+        ok, ctype = await fetch_head_ok(client, url)
+        if ok and (not ctype or any(mt in ctype for mt in FEED_MIME_HINTS)):
+            if await _url_serves_feed(client, url):
+                return url
+        return None
+
+    tasks = [
+        asyncio.create_task(try_candidate(base + path))
+        for base in bases
+        for path in COMMON_FEED_PATHS
+    ]
+    try:
+        for t in asyncio.as_completed(tasks):
+            res = await t
+            if res:
+                cache_set_host_feed(host_key, res)
+                return res
+    finally:
+        # Stop the probes that are still in flight (a no-op for finished ones)
+        # and collect their outcomes so nothing keeps using the client later.
+        for t in tasks:
+            t.cancel()
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+    tried = set()  # URLs already checked during the HTML fallback
+    for base in bases:
+        html = await fetch_page_html(client, base + "/")
+        if not html:
+            continue
+        soup = BeautifulSoup(html, "lxml")
+        for link in soup.find_all("link", rel=True, href=True):
+            rels = link.get("rel")
+            if isinstance(rels, list):
+                rels = {r.lower() for r in rels if r}
+            else:
+                rels = {str(rels).lower()}
+            typ = str(link.get("type", "")).lower()
+            href = link.get("href")
+            if "alternate" in rels and any(mt in typ for mt in ("rss", "atom", "xml")):
+                feed_url = urljoin(base + "/", href)
+                if feed_url in tried:
+                    continue
+                tried.add(feed_url)
+                if await _url_serves_feed(client, feed_url):
+                    cache_set_host_feed(host_key, feed_url)
+                    return feed_url
+        for a in soup.find_all("a", href=True):
+            href = a.get("href", "")
+            if any(tok in href.lower() for tok in ("rss", "atom", "feed")):
+                feed_url = urljoin(base + "/", href)
+                if feed_url in tried:
+                    continue
+                tried.add(feed_url)
+                ok, ctype = await fetch_head_ok(client, feed_url)
+                if ok and await _url_serves_feed(client, feed_url):
+                    cache_set_host_feed(host_key, feed_url)
+                    return feed_url
+
+    cache_set_host_feed(host_key, None)
+    return None
+
+
+# ------------------------------
+# SSE plumbing
+# ------------------------------
+class Job:
+    """One analysis run: a random id, the feed URL, a queue of ready-to-send SSE frames and a `done` flag."""
+
+    def __init__(self, feed_url: str):
+        self.id = str(uuid.uuid4())
+        self.feed_url = feed_url
+        self.queue: asyncio.Queue[str] = asyncio.Queue()
+        self.done = asyncio.Event()
+
+    async def emit(self, event: str, data: dict):
+        """Queue one frame named `event`; its JSON body carries the event name, `data` and a UTC ISO timestamp."""
+        payload = {"event": event, "data": data, "ts": datetime.now(timezone.utc).isoformat()}
+        await self.queue.put(f"event: {event}\ndata: {json.dumps(payload)}\n\n")
+
+    async def finish(self):
+        """Mark the job done and queue the closing `done` frame; repeated calls do nothing."""
+        if self.done.is_set():
+            return
+        self.done.set()
+        await self.queue.put("event: done\ndata: {}\n\n")
+
+
+JOBS: Dict[str, Job] = {}
+
+
+async def run_analysis_job(job: Job):
+    """
+    Run the whole link audit for `job.feed_url`, reporting progress through `job`.
+
+    Steps: record a FeedRun row, download the feed, collect the links of every
+    post, group them by host, discover a feed for each host (at most
+    DISCOVERY_CONCURRENCY at a time) while emitting one rendered card per
+    host, then store and emit a summary.
+
+    Nothing is raised to the caller: a failure becomes an "error" event, and
+    `job.finish()` always runs - also when the database insert or the HTTP
+    client set-up fails - so a stream waiting on this job is never left open.
+    """
+    try:
+        with Session(engine) as sess:
+            fr = FeedRun(feed_url=job.feed_url)
+            sess.add(fr)
+            sess.commit()
+
+        async with httpx.AsyncClient(http2=True) as client:
+            await job.emit("status", {"stage": "feed", "message": "Downloading and parsing feed…"})
+            post_urls = await fetch_feed_entries(client, job.feed_url)
+            await job.emit("posts", {"count": len(post_urls)})
+
+            all_links: List[str] = []
+            for idx, post_url in enumerate(post_urls, start=1):
+                await job.emit("status", {"stage": "posts", "message": f"Fetching post {idx}/{len(post_urls)}"})
+                links = await get_links_for_page(client, post_url)
+                all_links.extend(links)
+                await job.emit("post_progress", {"current": idx, "total": len(post_urls), "post_url": post_url})
+
+            host_map: Dict[str, HostSummary] = {}
+            for link in all_links:
+                host = normalize_host(urlparse(link).netloc)
+                if not host:
+                    continue
+                hs = host_map.setdefault(host, HostSummary(hostname=host))
+                hs.count += 1
+                hs.unique_links.add(link)
+                hs.link_counts[link] += 1
+
+            hosts_sorted = sorted(host_map.values(), key=lambda s: s.count, reverse=True)
+            await job.emit("hosts", {"count": len(hosts_sorted)})
+
+            sem = asyncio.Semaphore(DISCOVERY_CONCURRENCY)
+            max_count = max((h.count for h in hosts_sorted), default=1)
+
+            async def work(hs: HostSummary, idx: int, total: int):
+                async with sem:
+                    await job.emit("status", {"stage": "discover", "message": f"Discovering feed for {hs.hostname} ({idx}/{total})"})
+                    try:
+                        feed = await discover_feed_for_host(client, hs.hostname)
+                    except Exception:
+                        # One unreachable or misbehaving host must not abort the
+                        # audit of every other host; show its card without a feed.
+                        feed = None
+                    hs.feed_url = feed
+                    host_dict = {
+                        "hostname": hs.hostname,
+                        "count": hs.count,
+                        "unique_link_count": len(hs.unique_links),
+                        "links": sorted(hs.unique_links),
+                        "top_links": [
+                            {"url": url, "count": cnt}
+                            for url, cnt in hs.link_counts.most_common()
+                            if cnt > 1
+                        ],
+                        "feed_url": hs.feed_url,
+                    }
+                    html = render_host_card(host_dict, max_count, index=idx)
+                    await job.emit("host_card", {"html": html, "index": idx, "total": total})
+
+            tasks = [asyncio.create_task(work(hs, i, len(hosts_sorted))) for i, hs in enumerate(hosts_sorted, start=1)]
+            async def heartbeat():
+                while any(not t.done() for t in tasks):
+                    await job.emit("status", {"stage": "discover", "message": "Still discovering host feeds…"})
+                    await asyncio.sleep(3)
+            hb = asyncio.create_task(heartbeat())
+            try:
+                await asyncio.gather(*tasks)
+            finally:
+                # Runs on failure too: stop the heartbeat and whatever is still
+                # working before the HTTP client is closed underneath them.
+                hb.cancel()
+                for t in tasks:
+                    t.cancel()
+                await asyncio.gather(hb, *tasks, return_exceptions=True)
+
+            summary = {
+                "feed_url": job.feed_url,
+                "post_count": len(post_urls),
+                "hosts": [h.hostname for h in hosts_sorted],
+                "fetched_at": datetime.now(timezone.utc).isoformat(),
+            }
+            # NOTE(review): the run is looked up again by feed URL + newest start
+            # time; two overlapping runs of the same feed could update the same
+            # row - confirm whether remembering the inserted row's key is safer.
+            with Session(engine) as sess:
+                fr = sess.exec(select(FeedRun).where(FeedRun.feed_url == job.feed_url).order_by(FeedRun.started_at.desc())).first()
+                if fr:
+                    fr.summary_json = json.dumps(summary)
+                    fr.finished_at = datetime.now(timezone.utc)
+                    sess.add(fr)
+                    sess.commit()
+
+            await job.emit("summary", summary)
+    except Exception as e:
+        await job.emit("error", {"message": str(e)})
+    finally:
+        await job.finish()
+
+
+# ------------------------------
+# Template rendering for components
+# ------------------------------
+templates = Jinja2Templates(directory="templates")
+
+def render_host_card(host: dict, max_count: int, index: int) -> str:
+    """
+    Render components/host_card.html for one host and return the HTML.
+
+    The template is rendered outside a real HTTP request, so a minimal
+    stand-in object exposing only an empty `state` is passed as `request`.
+    """
+    class Dummy:
+        def __init__(self): self.state = type("s", (), {})()
+    req = Dummy()
+    html = templates.get_template("components/host_card.html").render(
+        request=req, host=host, max_count=max_count, index=index
+    )
+    return html
+
+
+# 
------------------------------
+# FastAPI app + routes
+# ------------------------------
+app = FastAPI(title="RSS Link Audit", version="1.2.0")
+app.mount("/static", StaticFiles(directory="static"), name="static")
+
+# Strong references to fire-and-forget tasks: the event loop only keeps weak
+# ones, so an unreferenced task may be garbage-collected before it finishes.
+_BACKGROUND_TASKS = set()
+
+
+@app.get("/", response_class=HTMLResponse)
+async def index(request: Request):
+    """Serve the landing page."""
+    return templates.TemplateResponse("index.html", {"request": request})
+
+
+@app.post("/start", response_class=JSONResponse)
+async def start(feed_url: str = Form(...)):
+    """Register a job for `feed_url`, start it in the background and return its id."""
+    job = Job(feed_url)
+    JOBS[job.id] = job
+    task = asyncio.create_task(run_analysis_job(job))
+    _BACKGROUND_TASKS.add(task)
+    task.add_done_callback(_BACKGROUND_TASKS.discard)
+    return {"job_id": job.id}
+
+
+@app.get("/events/{job_id}")
+async def sse(job_id: str):
+    """
+    Stream a job's queued frames as server-sent events.
+
+    Sends a `hello` frame first, a `ping` frame after every 30 seconds
+    without output, and ends once the job is done AND its queue is empty.
+    Responds 404 when the job id is unknown.
+    """
+    job = JOBS.get(job_id)
+    if not job:
+        raise HTTPException(404, "Job not found")
+
+    async def event_gen():
+        try:
+            yield f"event: hello\ndata: {{\"job_id\":\"{job.id}\"}}\n\n"
+            while True:
+                try:
+                    item = await asyncio.wait_for(job.queue.get(), timeout=30.0)
+                except asyncio.TimeoutError:
+                    if job.done.is_set() and job.queue.empty():
+                        break
+                    yield "event: ping\ndata: {}\n\n"
+                    continue
+                yield item
+                # `done` can already be set while frames are still queued (the
+                # final summary/error and the closing `done` frame), so drain
+                # the queue before stopping instead of dropping them.
+                if job.done.is_set() and job.queue.empty():
+                    break
+        finally:
+            # Forget finished jobs even if the client went away mid-stream; an
+            # unfinished job stays registered so a reconnect can resume it.
+            if job.done.is_set():
+                JOBS.pop(job.id, None)
+
+    return StreamingResponse(
+        event_gen(),
+        media_type="text/event-stream",
+        headers={"Cache-Control": "no-cache"},
+    )
+
+
+@app.post("/api/analyze", response_class=JSONResponse)
+async def analyze_api(payload: Dict):
+    """
+    Run an analysis to completion and report how it ended.
+
+    Expects a JSON object with `feed_url` (400 when missing). Returns
+    `{"ok": true, "summary": ...}` on success, or
+    `{"ok": false, "error": "<message>"}` when the job emitted an error.
+    """
+    feed_url = payload.get("feed_url")
+    if not feed_url:
+        raise HTTPException(status_code=400, detail="Missing 'feed_url'")
+    job = Job(feed_url)
+    await run_analysis_job(job)
+
+    # Nobody streams this job, so read its outcome back from the queued
+    # frames ("event: <name>\ndata: <json>\n\n").
+    result = {"ok": True}
+    while not job.queue.empty():
+        frame = job.queue.get_nowait()
+        header, _, body = frame.partition("\ndata: ")
+        if header == "event: error":
+            message = json.loads(body).get("data", {}).get("message", "")
+            return JSONResponse(content={"ok": False, "error": message})
+        if header == "event: summary":
+            result["summary"] = json.loads(body).get("data")
+    return JSONResponse(content=result)
+
+
+@app.get("/healthz")
+async def healthz():
+    """Liveness probe."""
+    return {"ok": True}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3cc9ad3
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+fastapi
+uvicorn[standard]
+httpx[http2]
+feedparser
+beautifulsoup4
+lxml
+jinja2
+sqlmodel
+aiosqlite
+sqlalchemy>=2.0
+python-multipart
diff --git a/static/styles.css b/static/styles.css
new file mode 100644
index 0000000..e7aa7a9
--- /dev/null
+++ b/static/styles.css
@@ -0,0 +1,84 @@
+/* Royal Armory Palette */
+:root {
+  --ra-ink: #000030;
+  --ra-plum: #3f0a57;
+  
--ra-magenta: #85106b; + --ra-ruby: #b02c2c; + --ra-bronze: #b8673e; + --ra-amber: #d9932b; + --ra-gold: #f0bd71; + --ra-cream: #ffe3ba; + + --ra-bg: var(--ra-ink); + --ra-panel: #0b0b3f; + --ra-copper: #6f3b2b; + --ra-ruby-dark: #8c2323; +} + +* { box-sizing: border-box; } +html, body { + margin: 0; + background: var(--ra-bg); + color: var(--ra-cream); + font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, Arial; + line-height: 1.5; +} + +a { color: var(--ra-gold); text-decoration: none; } +a:hover { text-decoration: underline; } + +header, footer { background: linear-gradient(0deg, rgba(64,10,87,0.25), rgba(64,10,87,0.25)); } +main { padding: 1rem; } + +.link { word-break: break-all; text-underline-offset: 3px; } + +.bar-wrap { width: 100%; background: rgba(240,189,113,0.12); height: 12px; } +.bar { height: 12px; background: linear-gradient(90deg, var(--ra-amber), var(--ra-gold)); } + +.btn-more { background: var(--ra-plum); padding: 6px 10px; border-radius: 10px; font-weight: 600; color: var(--ra-cream); } + +.more-list[data-expanded="false"] { display: none; } +.more-list[data-expanded="true"] { display: block; } + +/* utilities */ +.max-w-3xl { max-width: 48rem; } +.max-w-5xl { max-width: 64rem; } +.mx-auto { margin-left: auto; margin-right: auto; } +.p-6 { padding: 1.5rem; } +.p-5 { padding: 1.25rem; } +.p-4 { padding: 1rem; } +.px-6 { padding-left: 1.5rem; padding-right: 1.5rem; } +.py-4 { padding-top: 1rem; padding-bottom: 1rem; } +.py-10{ padding-top: 2.5rem; padding-bottom: 2.5rem; } +.mt-2 { margin-top: 0.5rem; } +.mt-4 { margin-top: 1rem; } +.mt-6 { margin-top: 1.5rem; } +.mb-1 { margin-bottom: 0.25rem; } +.mb-2 { margin-bottom: 0.5rem; } +.space-y-1 > * + * { margin-top: 0.25rem; } +.space-y-6 > * + * { margin-top: 1.5rem; } +.rounded-2xl { border-radius: 1rem; } +.rounded-xl { border-radius: 0.75rem; } +.shadow { box-shadow: 0 10px 30px rgba(0,0,0,0.25); } +.font-bold { font-weight: 700; } 
+.font-semibold { font-weight: 600; } +.text-sm { font-size: 0.875rem; } +.text-xl { font-size: 1.25rem; } +.text-3xl { font-size: 1.875rem; } +.opacity-70 { opacity: 0.7; } +.opacity-80 { opacity: 0.8; } +.border { border-width: 1px; } +.border-b { border-bottom-width: 1px; } +.flex { display: flex; } +.items-center { align-items: center; } +.items-baseline { align-items: baseline; } +.justify-between { justify-content: space-between; } +.gap-2 { gap: 0.5rem; } +.gap-4 { gap: 1rem; } +.min-w-0 { min-width: 0; } +.shrink-0 { flex-shrink: 0; } +.break-all { word-break: break-all; } +.w-3 { width: 0.75rem; } +.h-3 { height: 0.75rem; } +input, button { border: none; } +button { cursor: pointer; } diff --git a/templates/components/host_card.html b/templates/components/host_card.html new file mode 100644 index 0000000..f2d4b3e --- /dev/null +++ b/templates/components/host_card.html @@ -0,0 +1,62 @@ +{% set pct = (100 * host.count // (max_count or 1)) %} +
+
+
+

{{ host.hostname }}

+
+ Links: {{ host.count }} + Unique: {{ host.unique_link_count }} +
+
+ {% if host.feed_url %} +
+ RSS / Atom + + {% endif %} +
+ +
+
+
+ +
+ {% if host.top_links %} +
+
Top links (mentioned > 1):
+
    + {% for tl in host.top_links %} +
  • + {{ tl.count }} + {{ tl.url }} +
  • + {% endfor %} +
+
+ {% endif %} + + {% set list_id = "links-" ~ index %} + {% set links = host.links %} + {% set preview = links[:8] %} + {% set remainder = links[8:] %} +
+
Links:
+
    + {% for url in preview %} +
  • {{ url }}
  • + {% endfor %} +
+ + {% if remainder %} +
+
    + {% for url in remainder %} +
  • {{ url }}
  • + {% endfor %} +
+
+ + {% endif %} +
+
+
diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..abb68ae --- /dev/null +++ b/templates/index.html @@ -0,0 +1,104 @@ +{% extends "layout.html" %} +{% block content %} +
+

RSS Link Audit

+

Paste a feed URL. This version uses SQLite/SQLModel caching and streams progress over SSE.

+ +
+ + +
+ +
+ +
+
+
+ + +{% endblock %} diff --git a/templates/layout.html b/templates/layout.html new file mode 100644 index 0000000..b729e2b --- /dev/null +++ b/templates/layout.html @@ -0,0 +1,24 @@ + + + + + + RSS Link Audit + + + +
+
+
+ RSS Link Audit + with SQLite cache + SSE +
+
+
+ {% block content %}{% endblock %} +
+ + +