Initial Commit for rss-link-app
Analyze links from rss feeds
This commit is contained in:
commit
060f998c59
8 changed files with 1837 additions and 0 deletions
966
.gitignore
vendored
Normal file
966
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,966 @@
|
|||
# Created by https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
|
||||
|
||||
### Data ###
|
||||
*.csv
|
||||
*.dat
|
||||
*.efx
|
||||
*.gbr
|
||||
*.key
|
||||
*.pps
|
||||
*.ppt
|
||||
*.pptx
|
||||
*.sdf
|
||||
*.tax2010
|
||||
*.vcf
|
||||
*.xml
|
||||
|
||||
### Emacs ###
|
||||
# -*- mode: gitignore; -*-
|
||||
*~
|
||||
\#*\#
|
||||
/.emacs.desktop
|
||||
/.emacs.desktop.lock
|
||||
*.elc
|
||||
auto-save-list
|
||||
tramp
|
||||
.\#*
|
||||
|
||||
# Org-mode
|
||||
.org-id-locations
|
||||
*_archive
|
||||
|
||||
# flymake-mode
|
||||
*_flymake.*
|
||||
|
||||
# eshell files
|
||||
/eshell/history
|
||||
/eshell/lastdir
|
||||
|
||||
# elpa packages
|
||||
/elpa/
|
||||
|
||||
# reftex files
|
||||
*.rel
|
||||
|
||||
# AUCTeX auto folder
|
||||
/auto/
|
||||
|
||||
# cask packages
|
||||
.cask/
|
||||
dist/
|
||||
|
||||
# Flycheck
|
||||
flycheck_*.el
|
||||
|
||||
# server auth directory
|
||||
/server/
|
||||
|
||||
# projectiles files
|
||||
.projectile
|
||||
|
||||
# directory configuration
|
||||
.dir-locals.el
|
||||
|
||||
# network security
|
||||
/network-security.data
|
||||
|
||||
|
||||
### Executable ###
|
||||
*.app
|
||||
*.bat
|
||||
*.cgi
|
||||
*.com
|
||||
*.exe
|
||||
*.gadget
|
||||
*.jar
|
||||
*.pif
|
||||
*.vb
|
||||
*.wsf
|
||||
|
||||
### Node ###
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
lerna-debug.log*
|
||||
.pnpm-debug.log*
|
||||
|
||||
# Diagnostic reports (https://nodejs.org/api/report.html)
|
||||
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage
|
||||
*.lcov
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (https://nodejs.org/api/addons.html)
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
jspm_packages/
|
||||
|
||||
# Snowpack dependency directory (https://snowpack.dev/)
|
||||
web_modules/
|
||||
|
||||
# TypeScript cache
|
||||
*.tsbuildinfo
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional stylelint cache
|
||||
.stylelintcache
|
||||
|
||||
# Microbundle cache
|
||||
.rpt2_cache/
|
||||
.rts2_cache_cjs/
|
||||
.rts2_cache_es/
|
||||
.rts2_cache_umd/
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variable files
|
||||
.env
|
||||
.env.development.local
|
||||
.env.test.local
|
||||
.env.production.local
|
||||
.env.local
|
||||
|
||||
# parcel-bundler cache (https://parceljs.org/)
|
||||
.cache
|
||||
.parcel-cache
|
||||
|
||||
# Next.js build output
|
||||
.next
|
||||
out
|
||||
|
||||
# Nuxt.js build / generate output
|
||||
.nuxt
|
||||
dist
|
||||
|
||||
# Gatsby files
|
||||
.cache/
|
||||
# Comment in the public line in if your project uses Gatsby and not Next.js
|
||||
# https://nextjs.org/blog/next-9-1#public-directory-support
|
||||
# public
|
||||
|
||||
# vuepress build output
|
||||
.vuepress/dist
|
||||
|
||||
# vuepress v2.x temp and cache directory
|
||||
.temp
|
||||
|
||||
# Docusaurus cache and generated files
|
||||
.docusaurus
|
||||
|
||||
# Serverless directories
|
||||
.serverless/
|
||||
|
||||
# FuseBox cache
|
||||
.fusebox/
|
||||
|
||||
# DynamoDB Local files
|
||||
.dynamodb/
|
||||
|
||||
# TernJS port file
|
||||
.tern-port
|
||||
|
||||
# Stores VSCode versions used for testing VSCode extensions
|
||||
.vscode-test
|
||||
|
||||
# yarn v2
|
||||
.yarn/cache
|
||||
.yarn/unplugged
|
||||
.yarn/build-state.yml
|
||||
.yarn/install-state.gz
|
||||
.pnp.*
|
||||
|
||||
### Node Patch ###
|
||||
# Serverless Webpack directories
|
||||
.webpack/
|
||||
|
||||
# Optional stylelint cache
|
||||
|
||||
# SvelteKit build / generate output
|
||||
.svelte-kit
|
||||
|
||||
### PyCharm ###
|
||||
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
|
||||
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
|
||||
|
||||
# User-specific stuff
|
||||
.idea/**/workspace.xml
|
||||
.idea/**/tasks.xml
|
||||
.idea/**/usage.statistics.xml
|
||||
.idea/**/dictionaries
|
||||
.idea/**/shelf
|
||||
|
||||
# AWS User-specific
|
||||
.idea/**/aws.xml
|
||||
|
||||
# Generated files
|
||||
.idea/**/contentModel.xml
|
||||
|
||||
# Sensitive or high-churn files
|
||||
.idea/**/dataSources/
|
||||
.idea/**/dataSources.ids
|
||||
.idea/**/dataSources.local.xml
|
||||
.idea/**/sqlDataSources.xml
|
||||
.idea/**/dynamic.xml
|
||||
.idea/**/uiDesigner.xml
|
||||
.idea/**/dbnavigator.xml
|
||||
|
||||
# Gradle
|
||||
.idea/**/gradle.xml
|
||||
.idea/**/libraries
|
||||
|
||||
# Gradle and Maven with auto-import
|
||||
# When using Gradle or Maven with auto-import, you should exclude module files,
|
||||
# since they will be recreated, and may cause churn. Uncomment if using
|
||||
# auto-import.
|
||||
# .idea/artifacts
|
||||
# .idea/compiler.xml
|
||||
# .idea/jarRepositories.xml
|
||||
# .idea/modules.xml
|
||||
# .idea/*.iml
|
||||
# .idea/modules
|
||||
# *.iml
|
||||
# *.ipr
|
||||
|
||||
# CMake
|
||||
cmake-build-*/
|
||||
|
||||
# Mongo Explorer plugin
|
||||
.idea/**/mongoSettings.xml
|
||||
|
||||
# File-based project format
|
||||
*.iws
|
||||
|
||||
# IntelliJ
|
||||
out/
|
||||
|
||||
# mpeltonen/sbt-idea plugin
|
||||
.idea_modules/
|
||||
|
||||
# JIRA plugin
|
||||
atlassian-ide-plugin.xml
|
||||
|
||||
# Cursive Clojure plugin
|
||||
.idea/replstate.xml
|
||||
|
||||
# SonarLint plugin
|
||||
.idea/sonarlint/
|
||||
|
||||
# Crashlytics plugin (for Android Studio and IntelliJ)
|
||||
com_crashlytics_export_strings.xml
|
||||
crashlytics.properties
|
||||
crashlytics-build.properties
|
||||
fabric.properties
|
||||
|
||||
# Editor-based Rest Client
|
||||
.idea/httpRequests
|
||||
|
||||
# Android studio 3.1+ serialized cache file
|
||||
.idea/caches/build_file_checksums.ser
|
||||
|
||||
### PyCharm Patch ###
|
||||
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
|
||||
|
||||
# *.iml
|
||||
# modules.xml
|
||||
# .idea/misc.xml
|
||||
# *.ipr
|
||||
|
||||
# Sonarlint plugin
|
||||
# https://plugins.jetbrains.com/plugin/7973-sonarlint
|
||||
.idea/**/sonarlint/
|
||||
|
||||
# SonarQube Plugin
|
||||
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
|
||||
.idea/**/sonarIssues.xml
|
||||
|
||||
# Markdown Navigator plugin
|
||||
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
|
||||
.idea/**/markdown-navigator.xml
|
||||
.idea/**/markdown-navigator-enh.xml
|
||||
.idea/**/markdown-navigator/
|
||||
|
||||
# Cache file creation bug
|
||||
# See https://youtrack.jetbrains.com/issue/JBR-2257
|
||||
.idea/$CACHE_FILE$
|
||||
|
||||
# CodeStream plugin
|
||||
# https://plugins.jetbrains.com/plugin/12206-codestream
|
||||
.idea/codestream.xml
|
||||
|
||||
# Azure Toolkit for IntelliJ plugin
|
||||
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
|
||||
.idea/**/azureSettings.xml
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Python Patch ###
|
||||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||
poetry.toml
|
||||
|
||||
# ruff
|
||||
.ruff_cache/
|
||||
|
||||
# LSP config files
|
||||
pyrightconfig.json
|
||||
|
||||
### SublimeText ###
|
||||
# Cache files for Sublime Text
|
||||
*.tmlanguage.cache
|
||||
*.tmPreferences.cache
|
||||
*.stTheme.cache
|
||||
|
||||
# Workspace files are user-specific
|
||||
*.sublime-workspace
|
||||
|
||||
# Project files should be checked into the repository, unless a significant
|
||||
# proportion of contributors will probably not be using Sublime Text
|
||||
# *.sublime-project
|
||||
|
||||
# SFTP configuration file
|
||||
sftp-config.json
|
||||
sftp-config-alt*.json
|
||||
|
||||
# Package control specific files
|
||||
Package Control.last-run
|
||||
Package Control.ca-list
|
||||
Package Control.ca-bundle
|
||||
Package Control.system-ca-bundle
|
||||
Package Control.cache/
|
||||
Package Control.ca-certs/
|
||||
Package Control.merged-ca-bundle
|
||||
Package Control.user-ca-bundle
|
||||
oscrypto-ca-bundle.crt
|
||||
bh_unicode_properties.cache
|
||||
|
||||
# Sublime-github package stores a github token in this file
|
||||
# https://packagecontrol.io/packages/sublime-github
|
||||
GitHub.sublime-settings
|
||||
|
||||
### Vim ###
|
||||
# Swap
|
||||
[._]*.s[a-v][a-z]
|
||||
# Comment out the next line if you don't need vector files
!*.svg
|
||||
[._]*.sw[a-p]
|
||||
[._]s[a-rt-v][a-z]
|
||||
[._]ss[a-gi-z]
|
||||
[._]sw[a-p]
|
||||
|
||||
# Session
|
||||
Session.vim
|
||||
Sessionx.vim
|
||||
|
||||
# Temporary
|
||||
.netrwhist
|
||||
# Auto-generated tag files
|
||||
tags
|
||||
# Persistent undo
|
||||
[._]*.un~
|
||||
|
||||
### VisualStudioCode ###
|
||||
.vscode/*
|
||||
!.vscode/settings.json
|
||||
!.vscode/tasks.json
|
||||
!.vscode/launch.json
|
||||
!.vscode/extensions.json
|
||||
!.vscode/*.code-snippets
|
||||
|
||||
# Local History for Visual Studio Code
|
||||
.history/
|
||||
|
||||
# Built Visual Studio Code Extensions
|
||||
*.vsix
|
||||
|
||||
### VisualStudioCode Patch ###
|
||||
# Ignore all local history of files
|
||||
.history
|
||||
.ionide
|
||||
|
||||
### VisualStudio ###
|
||||
## Ignore Visual Studio temporary files, build results, and
|
||||
## files generated by popular Visual Studio add-ons.
|
||||
##
|
||||
## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
|
||||
|
||||
# User-specific files
|
||||
*.rsuser
|
||||
*.suo
|
||||
*.user
|
||||
*.userosscache
|
||||
*.sln.docstates
|
||||
|
||||
# User-specific files (MonoDevelop/Xamarin Studio)
|
||||
*.userprefs
|
||||
|
||||
# Mono auto generated files
|
||||
mono_crash.*
|
||||
|
||||
# Build results
|
||||
[Dd]ebug/
|
||||
[Dd]ebugPublic/
|
||||
[Rr]elease/
|
||||
[Rr]eleases/
|
||||
x64/
|
||||
x86/
|
||||
[Ww][Ii][Nn]32/
|
||||
[Aa][Rr][Mm]/
|
||||
[Aa][Rr][Mm]64/
|
||||
bld/
|
||||
[Bb]in/
|
||||
[Oo]bj/
|
||||
[Ll]og/
|
||||
[Ll]ogs/
|
||||
|
||||
# Visual Studio 2015/2017 cache/options directory
|
||||
.vs/
|
||||
# Uncomment if you have tasks that create the project's static files in wwwroot
|
||||
#wwwroot/
|
||||
|
||||
# Visual Studio 2017 auto generated files
|
||||
Generated\ Files/
|
||||
|
||||
# MSTest test Results
|
||||
[Tt]est[Rr]esult*/
|
||||
[Bb]uild[Ll]og.*
|
||||
|
||||
# NUnit
|
||||
*.VisualState.xml
|
||||
TestResult.xml
|
||||
nunit-*.xml
|
||||
|
||||
# Build Results of an ATL Project
|
||||
[Dd]ebugPS/
|
||||
[Rr]eleasePS/
|
||||
dlldata.c
|
||||
|
||||
# Benchmark Results
|
||||
BenchmarkDotNet.Artifacts/
|
||||
|
||||
# .NET Core
|
||||
project.lock.json
|
||||
project.fragment.lock.json
|
||||
artifacts/
|
||||
|
||||
# ASP.NET Scaffolding
|
||||
ScaffoldingReadMe.txt
|
||||
|
||||
# StyleCop
|
||||
StyleCopReport.xml
|
||||
|
||||
# Files built by Visual Studio
|
||||
*_i.c
|
||||
*_p.c
|
||||
*_h.h
|
||||
*.ilk
|
||||
*.meta
|
||||
*.obj
|
||||
*.iobj
|
||||
*.pch
|
||||
*.pdb
|
||||
*.ipdb
|
||||
*.pgc
|
||||
*.pgd
|
||||
*.rsp
|
||||
*.sbr
|
||||
*.tlb
|
||||
*.tli
|
||||
*.tlh
|
||||
*.tmp
|
||||
*.tmp_proj
|
||||
*_wpftmp.csproj
|
||||
*.tlog
|
||||
*.vspscc
|
||||
*.vssscc
|
||||
.builds
|
||||
*.pidb
|
||||
*.svclog
|
||||
*.scc
|
||||
|
||||
# Chutzpah Test files
|
||||
_Chutzpah*
|
||||
|
||||
# Visual C++ cache files
|
||||
ipch/
|
||||
*.aps
|
||||
*.ncb
|
||||
*.opendb
|
||||
*.opensdf
|
||||
*.cachefile
|
||||
*.VC.db
|
||||
*.VC.VC.opendb
|
||||
|
||||
# Visual Studio profiler
|
||||
*.psess
|
||||
*.vsp
|
||||
*.vspx
|
||||
*.sap
|
||||
|
||||
# Visual Studio Trace Files
|
||||
*.e2e
|
||||
|
||||
# TFS 2012 Local Workspace
|
||||
$tf/
|
||||
|
||||
# Guidance Automation Toolkit
|
||||
*.gpState
|
||||
|
||||
# ReSharper is a .NET coding add-in
|
||||
_ReSharper*/
|
||||
*.[Rr]e[Ss]harper
|
||||
*.DotSettings.user
|
||||
|
||||
# TeamCity is a build add-in
|
||||
_TeamCity*
|
||||
|
||||
# DotCover is a Code Coverage Tool
|
||||
*.dotCover
|
||||
|
||||
# AxoCover is a Code Coverage Tool
|
||||
.axoCover/*
|
||||
!.axoCover/settings.json
|
||||
|
||||
# Coverlet is a free, cross platform Code Coverage Tool
|
||||
coverage*.json
|
||||
coverage*.xml
|
||||
coverage*.info
|
||||
|
||||
# Visual Studio code coverage results
|
||||
*.coverage
|
||||
*.coveragexml
|
||||
|
||||
# NCrunch
|
||||
_NCrunch_*
|
||||
.*crunch*.local.xml
|
||||
nCrunchTemp_*
|
||||
|
||||
# MightyMoose
|
||||
*.mm.*
|
||||
AutoTest.Net/
|
||||
|
||||
# Web workbench (sass)
|
||||
.sass-cache/
|
||||
|
||||
# Installshield output folder
|
||||
[Ee]xpress/
|
||||
|
||||
# DocProject is a documentation generator add-in
|
||||
DocProject/buildhelp/
|
||||
DocProject/Help/*.HxT
|
||||
DocProject/Help/*.HxC
|
||||
DocProject/Help/*.hhc
|
||||
DocProject/Help/*.hhk
|
||||
DocProject/Help/*.hhp
|
||||
DocProject/Help/Html2
|
||||
DocProject/Help/html
|
||||
|
||||
# Click-Once directory
|
||||
publish/
|
||||
|
||||
# Publish Web Output
|
||||
*.[Pp]ublish.xml
|
||||
*.azurePubxml
|
||||
# Note: Comment the next line if you want to checkin your web deploy settings,
|
||||
# but database connection strings (with potential passwords) will be unencrypted
|
||||
*.pubxml
|
||||
*.publishproj
|
||||
|
||||
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
||||
# checkin your Azure Web App publish settings, but sensitive information contained
|
||||
# in these scripts will be unencrypted
|
||||
PublishScripts/
|
||||
|
||||
# NuGet Packages
|
||||
*.nupkg
|
||||
# NuGet Symbol Packages
|
||||
*.snupkg
|
||||
# The packages folder can be ignored because of Package Restore
|
||||
**/[Pp]ackages/*
|
||||
# except build/, which is used as an MSBuild target.
|
||||
!**/[Pp]ackages/build/
|
||||
# Uncomment if necessary however generally it will be regenerated when needed
|
||||
#!**/[Pp]ackages/repositories.config
|
||||
# NuGet v3's project.json files produces more ignorable files
|
||||
*.nuget.props
|
||||
*.nuget.targets
|
||||
|
||||
# Microsoft Azure Build Output
|
||||
csx/
|
||||
*.build.csdef
|
||||
|
||||
# Microsoft Azure Emulator
|
||||
ecf/
|
||||
rcf/
|
||||
|
||||
# Windows Store app package directories and files
|
||||
AppPackages/
|
||||
BundleArtifacts/
|
||||
Package.StoreAssociation.xml
|
||||
_pkginfo.txt
|
||||
*.appx
|
||||
*.appxbundle
|
||||
*.appxupload
|
||||
|
||||
# Visual Studio cache files
|
||||
# files ending in .cache can be ignored
|
||||
*.[Cc]ache
|
||||
# but keep track of directories ending in .cache
|
||||
!?*.[Cc]ache/
|
||||
|
||||
# Others
|
||||
ClientBin/
|
||||
~$*
|
||||
*.dbmdl
|
||||
*.dbproj.schemaview
|
||||
*.jfm
|
||||
*.pfx
|
||||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
||||
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
||||
#bower_components/
|
||||
|
||||
# RIA/Silverlight projects
|
||||
Generated_Code/
|
||||
|
||||
# Backup & report files from converting an old project file
|
||||
# to a newer Visual Studio version. Backup files are not needed,
|
||||
# because we have git ;-)
|
||||
_UpgradeReport_Files/
|
||||
Backup*/
|
||||
UpgradeLog*.XML
|
||||
UpgradeLog*.htm
|
||||
ServiceFabricBackup/
|
||||
*.rptproj.bak
|
||||
|
||||
# SQL Server files
|
||||
*.mdf
|
||||
*.ldf
|
||||
*.ndf
|
||||
|
||||
# Business Intelligence projects
|
||||
*.rdl.data
|
||||
*.bim.layout
|
||||
*.bim_*.settings
|
||||
*.rptproj.rsuser
|
||||
*- [Bb]ackup.rdl
|
||||
*- [Bb]ackup ([0-9]).rdl
|
||||
*- [Bb]ackup ([0-9][0-9]).rdl
|
||||
|
||||
# Microsoft Fakes
|
||||
FakesAssemblies/
|
||||
|
||||
# GhostDoc plugin setting file
|
||||
*.GhostDoc.xml
|
||||
|
||||
# Node.js Tools for Visual Studio
|
||||
.ntvs_analysis.dat
|
||||
|
||||
# Visual Studio 6 build log
|
||||
*.plg
|
||||
|
||||
# Visual Studio 6 workspace options file
|
||||
*.opt
|
||||
|
||||
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
||||
*.vbw
|
||||
|
||||
# Visual Studio 6 auto-generated project file (contains which files were open etc.)
|
||||
*.vbp
|
||||
|
||||
# Visual Studio 6 workspace and project file (working project files containing files to include in project)
|
||||
*.dsw
|
||||
*.dsp
|
||||
|
||||
# Visual Studio 6 technical files
|
||||
|
||||
# Visual Studio LightSwitch build output
|
||||
**/*.HTMLClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/ModelManifest.xml
|
||||
**/*.Server/GeneratedArtifacts
|
||||
**/*.Server/ModelManifest.xml
|
||||
_Pvt_Extensions
|
||||
|
||||
# Paket dependency manager
|
||||
.paket/paket.exe
|
||||
paket-files/
|
||||
|
||||
# FAKE - F# Make
|
||||
.fake/
|
||||
|
||||
# CodeRush personal settings
|
||||
.cr/personal
|
||||
|
||||
# Python Tools for Visual Studio (PTVS)
|
||||
*.pyc
|
||||
|
||||
# Cake - Uncomment if you are using it
|
||||
# tools/**
|
||||
# !tools/packages.config
|
||||
|
||||
# Tabs Studio
|
||||
*.tss
|
||||
|
||||
# Telerik's JustMock configuration file
|
||||
*.jmconfig
|
||||
|
||||
# BizTalk build output
|
||||
*.btp.cs
|
||||
*.btm.cs
|
||||
*.odx.cs
|
||||
*.xsd.cs
|
||||
|
||||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
*.binlog
|
||||
|
||||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
||||
|
||||
# Local History for Visual Studio
|
||||
.localhistory/
|
||||
|
||||
# Visual Studio History (VSHistory) files
|
||||
.vshistory/
|
||||
|
||||
# BeatPulse healthcheck temp database
|
||||
healthchecksdb
|
||||
|
||||
# Backup folder for Package Reference Convert tool in Visual Studio 2017
|
||||
MigrationBackup/
|
||||
|
||||
# Ionide (cross platform F# VS Code tools) working folder
|
||||
.ionide/
|
||||
|
||||
# Fody - auto-generated XML schema
|
||||
FodyWeavers.xsd
|
||||
|
||||
# VS Code files for those working on multiple tools
|
||||
*.code-workspace
|
||||
|
||||
# Local History for Visual Studio Code
|
||||
|
||||
# Windows Installer files from build outputs
|
||||
*.cab
|
||||
*.msi
|
||||
*.msix
|
||||
*.msm
|
||||
*.msp
|
||||
|
||||
# JetBrains Rider
|
||||
*.sln.iml
|
||||
|
||||
### VisualStudio Patch ###
|
||||
# Additional files built by Visual Studio
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/vim,node,data,emacs,python,pycharm,executable,sublimetext,visualstudio,visualstudiocode
|
||||
|
||||
*.db
|
||||
50
README.md
Normal file
50
README.md
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
# RSS Link Audit (FastAPI)
|
||||
|
||||
A FastAPI app that accepts an RSS/Atom feed URL, fetches each post’s full HTML, extracts outbound links, groups them by hostname, **hunts for each host’s RSS feed** (common endpoints + homepage discovery), and renders a stylish report using the **Royal Armory** palette.
|
||||
|
||||
## Features
|
||||
|
||||
- Input a feed URL via UI or JSON.
|
||||
- Concurrent fetching (httpx + asyncio).
|
||||
- Extract links from each post page.
|
||||
- Group by hostname; count occurrences.
|
||||
- Heuristic RSS discovery:
|
||||
- Probe common feed endpoints (e.g. `/feed`, `/rss.xml`, `/atom.xml`, etc.).
|
||||
- Parse homepage `<link rel="alternate" ...>` for RSS/Atom.
|
||||
- Scan homepage `<a>` tags for `rss|atom|feed`.
|
||||
- Validate candidates with `feedparser`.
|
||||
- Report UI:
|
||||
- Per-host card with counts.
|
||||
- **Bar** visual for how many links a host has.
|
||||
- **Top links** (if mentioned > 1).
|
||||
- Links list truncated with a **More** button.
|
||||
- RSS/Atom badge if found.
|
||||
|
||||
## Run locally
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
||||
pip install -r requirements.txt
|
||||
uvicorn main:app --reload
|
||||
```
|
||||
|
||||
Open: http://127.0.0.1:8000
|
||||
|
||||
## API
|
||||
|
||||
```
|
||||
POST /api/analyze
|
||||
Content-Type: application/json
|
||||
|
||||
{"feed_url": "https://example.com/feed.xml"}
|
||||
```
|
||||
|
||||
Returns JSON with the summarized data.
|
||||
|
||||
## Notes / Caveats
|
||||
|
||||
- Only static HTML is parsed (no JS rendering).
|
||||
- Some sites block bots; results may vary.
|
||||
- For large feeds, you may wish to trim the number of posts (e.g., slice `post_urls` in `analyze_feed`).
|
||||
- Consider adding caching (e.g., `aiocache`, Redis) if you’ll run this frequently.
|
||||
537
main.py
Normal file
537
main.py
Normal file
|
|
@ -0,0 +1,537 @@
|
|||
# main.py (v1.2) — robust feed parsing, clearer SSE progress, normalized host caching, concurrent discovery
|
||||
import asyncio
|
||||
import json
|
||||
import uuid
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
import httpx
|
||||
import feedparser
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from fastapi import FastAPI, Request, Form, HTTPException
|
||||
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from sqlmodel import Field, SQLModel, create_engine, Session, select
|
||||
from datetime import datetime, timezone
|
||||
|
||||
|
||||
# ------------------------------
# Settings / Constants
# ------------------------------
# Per-request timeout budget: 15s overall/read, 8s to establish a connection.
REQUEST_TIMEOUT = httpx.Timeout(15.0, connect=8.0, read=15.0)

# Identify the crawler politely; some hosts reject requests without a User-Agent.
HEADERS = {
    "User-Agent": "LinkAuditBot/1.2 (+https://example.com; contact: admin@example.com)"
}

# Well-known feed locations probed during heuristic RSS discovery.
# Covers WordPress, Blogger, static-site generators (index.xml) and common blog layouts.
COMMON_FEED_PATHS = [
    "/feed", "/feed/", "/feed.xml",
    "/rss", "/rss.xml", "/rss/",
    "/atom", "/atom.xml",
    "/index.xml",
    "/blog/feed", "/blog/rss", "/blog/rss.xml", "/blog/index.xml",
    "/feeds/posts/default?alt=rss",  # Blogger
    "/news/atom.xml", "/news/rss.xml",
    "/.rss", "/?feed=rss2",  # WP variants
    "/category/news/feed", "/?feed=atom",
]

# Content-Type values that suggest a response body is an RSS/Atom feed.
FEED_MIME_HINTS = {
    "application/rss+xml",
    "application/atom+xml",
    "application/xml",
    "text/xml",
}

# Maximum number of feed-discovery probes run concurrently.
DISCOVERY_CONCURRENCY = 10
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Database Models (SQLModel)
|
||||
# ------------------------------
|
||||
class PageCache(SQLModel, table=True):
    """Raw-HTML cache: one row per fetched page URL."""
    url: str = Field(primary_key=True)
    # None records a fetch that yielded no body (e.g. failed request) —
    # presumably a negative-cache entry; confirm against the fetch path.
    html: Optional[str] = None
    # Timezone-aware UTC timestamp of when the page was fetched.
    fetched_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
class LinksCache(SQLModel, table=True):
    """Extracted-links cache: outbound links per page URL, JSON-encoded."""
    url: str = Field(primary_key=True)
    links_json: str  # JSON list[str]
    # Timezone-aware UTC timestamp of when link extraction ran.
    extracted_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
class HostFeedCache(SQLModel, table=True):
    """Feed-discovery result cache, keyed by NORMALIZED hostname (see normalize_host)."""
    hostname: str = Field(primary_key=True)  # normalized!
    # None appears to mean "discovery ran and found nothing" (negative cache) —
    # TODO confirm against the discovery code.
    feed_url: Optional[str] = None
    # Timezone-aware UTC timestamp of the last discovery attempt.
    checked_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
|
||||
|
||||
class FeedRun(SQLModel, table=True):
    """One analysis run of an input feed URL (history/audit record)."""
    id: str = Field(primary_key=True, default_factory=lambda: str(uuid.uuid4()))
    feed_url: str
    started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    # Remains None while the run is in progress.
    finished_at: Optional[datetime] = None
    summary_json: Optional[str] = None  # store last summary, if desired
|
||||
|
||||
|
||||
# SQLite-backed cache database; tables are created eagerly at import time.
engine = create_engine("sqlite:///cache.db", echo=False)
SQLModel.metadata.create_all(engine)
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Data models
|
||||
# ------------------------------
|
||||
@dataclass
class HostSummary:
    """Aggregated outbound-link statistics for a single hostname."""
    hostname: str
    # Total link occurrences attributed to this host (duplicates counted).
    count: int = 0
    # Distinct outbound URLs seen for this host.
    unique_links: Set[str] = field(default_factory=set)
    # Occurrence count per individual URL.
    link_counts: Counter = field(default_factory=Counter)
    # Discovered RSS/Atom feed for the host, if any.
    feed_url: Optional[str] = None
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Utilities
|
||||
# ------------------------------
|
||||
def now_utc() -> datetime:
    """Return the current moment as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)
|
||||
|
||||
def normalize_host(host: str) -> str:
    """Canonicalize a hostname: trim whitespace, lowercase, drop any
    trailing dot, and strip a leading 'www.' prefix.

    Falsy input (empty string, None) is returned unchanged.
    """
    if not host:
        return host
    cleaned = host.strip().lower().rstrip(".")
    return cleaned[4:] if cleaned.startswith("www.") else cleaned
|
||||
|
||||
def is_http_url(href: str) -> bool:
    """True when ``href`` parses with an http or https scheme."""
    try:
        return urlparse(href).scheme in ("http", "https")
    except Exception:
        # Unparseable input is simply "not an http URL".
        return False
|
||||
|
||||
def absolutize(href: str, base_url: str) -> Optional[str]:
    """Resolve ``href`` against ``base_url`` and return the absolute URL,
    or None when it is empty, a fragment, a mailto:/tel: reference, or
    does not resolve to an http(s) URL.
    """
    if not href or href.startswith(("#", "mailto:", "tel:")):
        return None
    try:
        resolved = urljoin(base_url, href)
        # Keep only web URLs; other schemes (javascript:, ftp:, …) are dropped.
        if urlparse(resolved).scheme in ("http", "https"):
            return resolved
    except Exception:
        return None
    return None
|
||||
|
||||
def extract_links_from_html(html: str, base_url: str) -> List[str]:
    """Collect every resolvable http(s) link from the anchor tags in ``html``,
    in document order (duplicates preserved)."""
    soup = BeautifulSoup(html, "lxml")
    candidates = (
        absolutize(anchor.get("href"), base_url)
        for anchor in soup.find_all("a", href=True)
    )
    return [url for url in candidates if url]
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Networking
|
||||
# ------------------------------
|
||||
async def fetch_text(client: httpx.AsyncClient, url: str) -> Optional[str]:
    """GET ``url`` and return the decoded body, or None on any network
    error or HTTP status >= 400.

    Redirects are followed. The server-declared charset is respected;
    only when no usable encoding is present do we fall back to UTF-8.
    """
    try:
        r = await client.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True)
        if r.status_code < 400:
            # BUG FIX: httpx.Response has no `apparent_encoding` (that is a
            # `requests` API). The old fallback raised AttributeError, which
            # the bare except swallowed — silently dropping every page whose
            # response lacked a charset. Fall back to plain UTF-8 instead.
            if r.encoding is None:
                r.encoding = "utf-8"
            return r.text
    except Exception:
        # Best-effort fetch: treat network/protocol errors as "unavailable".
        return None
    return None
|
||||
|
||||
async def fetch_bytes(client: httpx.AsyncClient, url: str) -> Optional[Tuple[bytes, Optional[str]]]:
    """GET ``url`` and return (raw body, content-type header value),
    or None on any network error or HTTP status >= 400."""
    try:
        resp = await client.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True)
    except Exception:
        return None
    if resp.status_code >= 400:
        return None
    return resp.content, resp.headers.get("content-type")
|
||||
|
||||
async def fetch_head_ok(client: httpx.AsyncClient, url: str) -> Tuple[bool, Optional[str]]:
    """Probe `url` with HEAD, then GET as a fallback (some servers reject HEAD).

    Returns (reachable, content-type header or None).
    """
    for method in ("HEAD", "GET"):
        try:
            resp = await client.request(
                method, url, headers=HEADERS, timeout=REQUEST_TIMEOUT, follow_redirects=True
            )
            if resp.status_code < 400:
                return True, resp.headers.get("content-type")
        except Exception:
            pass
    return False, None
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Cache helpers
|
||||
# ------------------------------
|
||||
def cache_get_page(url: str) -> Optional[PageCache]:
    """Return the cached page row for `url`, or None when not cached."""
    with Session(engine) as sess:
        return sess.get(PageCache, url)
|
||||
|
||||
def cache_set_page(url: str, html: Optional[str]):
    """Upsert the fetched HTML for `url` (html may be None for a failed fetch)."""
    with Session(engine) as sess:
        # merge() performs insert-or-update keyed on the URL primary key.
        sess.merge(PageCache(url=url, html=html, fetched_at=now_utc()))
        sess.commit()
|
||||
|
||||
def cache_get_links(url: str) -> Optional[List[str]]:
    """Return the cached outbound-link list for `url`.

    None means "not cached" — either no row exists or the stored JSON could
    not be decoded.
    """
    with Session(engine) as sess:
        row = sess.get(LinksCache, url)
        if row is None:
            return None
        try:
            return json.loads(row.links_json)
        except Exception:
            return None
|
||||
|
||||
def cache_set_links(url: str, links: List[str]):
    """Upsert the extracted link list for `url`, stored as a JSON array."""
    payload = json.dumps(links)
    with Session(engine) as sess:
        sess.merge(LinksCache(url=url, links_json=payload, extracted_at=now_utc()))
        sess.commit()
|
||||
|
||||
def cache_get_host_feed(hostname: str) -> Optional[str]:
    """Return the cached feed URL for a host (key is normalized first).

    NOTE(review): a host cached with feed_url=None is indistinguishable from
    an uncached host here, so negative discovery results are never honored.
    """
    key = normalize_host(hostname)
    with Session(engine) as sess:
        row = sess.get(HostFeedCache, key)
        return row.feed_url if row else None
|
||||
|
||||
def cache_set_host_feed(hostname: str, feed_url: Optional[str]):
    """Upsert the discovered feed URL (or None meaning "none found") for a host."""
    key = normalize_host(hostname)
    with Session(engine) as sess:
        sess.merge(HostFeedCache(hostname=key, feed_url=feed_url, checked_at=now_utc()))
        sess.commit()
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Cached fetch/extract
|
||||
# ------------------------------
|
||||
async def fetch_page_html(client: httpx.AsyncClient, url: str) -> Optional[str]:
    """Cached page fetch: return stored HTML when present, otherwise GET and store.

    A fetch failure is stored as a row with empty html, so it is retried on
    the next call (the cache hit requires non-empty html).
    """
    entry = cache_get_page(url)
    if entry is not None and entry.html:
        return entry.html
    fetched = await fetch_text(client, url)
    cache_set_page(url, fetched)
    return fetched
|
||||
|
||||
async def get_links_for_page(client: httpx.AsyncClient, url: str) -> List[str]:
    """Outbound links for one page, memoized in the links cache.

    An empty list is cached for pages that could not be fetched.
    """
    memo = cache_get_links(url)
    if memo is not None:
        return memo
    page_html = await fetch_page_html(client, url)
    extracted = extract_links_from_html(page_html, url) if page_html else []
    cache_set_links(url, extracted)
    return extracted
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Robust feed parsing
|
||||
# ------------------------------
|
||||
async def fetch_feed_entries(client: httpx.AsyncClient, feed_url: str) -> List[str]:
    """
    Fetch feed as bytes and let feedparser infer encoding using headers.
    Retry a couple fallbacks for mismatched declarations.

    Returns the deduplicated entry URLs; raises ValueError when the feed
    cannot be downloaded or parsed.
    """
    got = await fetch_bytes(client, feed_url)
    if not got:
        raise ValueError("Could not download the feed.")
    content, ctype = got

    parsed = feedparser.parse(content)
    if parsed.bozo == 0 and (parsed.feed or parsed.entries):
        return _entries_to_urls(parsed)
    # Fallback 1: strip a UTF-8 BOM.
    # BUG FIX: lstrip(b"\xef\xbb\xbf") treats the argument as a SET of byte
    # values and strips any leading run of them (potentially eating real
    # content), and the old `is not` change-check relied on CPython object
    # identity. removeprefix drops exactly one BOM and nothing else.
    cleaned = content.removeprefix(b"\xef\xbb\xbf")
    if cleaned != content:
        parsed2 = feedparser.parse(cleaned)
        if parsed2.bozo == 0 and (parsed2.feed or parsed2.entries):
            return _entries_to_urls(parsed2)
    # Fallback 2: replace us-ascii decl with utf-8
    try:
        cleaned2 = cleaned.replace(b'encoding="us-ascii"', b'encoding="utf-8"')
        parsed3 = feedparser.parse(cleaned2)
        if parsed3.bozo == 0 and (parsed3.feed or parsed3.entries):
            return _entries_to_urls(parsed3)
    except Exception:
        pass
    raise ValueError(f"Could not parse feed: {getattr(parsed, 'bozo_exception', 'unknown parse error')}")
|
||||
|
||||
def _entries_to_urls(parsed) -> List[str]:
|
||||
urls: List[str] = []
|
||||
for e in parsed.entries:
|
||||
if getattr(e, "link", None):
|
||||
urls.append(e.link)
|
||||
elif getattr(e, "id", None) and is_http_url(e.id):
|
||||
urls.append(e.id)
|
||||
seen, out = set(), []
|
||||
for u in urls:
|
||||
if u not in seen:
|
||||
seen.add(u)
|
||||
out.append(u)
|
||||
return out
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Feed discovery (normalized + concurrent)
|
||||
# ------------------------------
|
||||
async def discover_feed_for_host(client: httpx.AsyncClient, hostname: str) -> Optional[str]:
    """Find a working RSS/Atom feed URL for `hostname`.

    Strategy: probe common feed paths concurrently over https/http (and the
    www. variant), then scan each base page for <link rel="alternate"> tags,
    then for <a> hrefs that look feed-ish. The result (including None for
    "nothing found") is written to the host-feed cache.

    BUG FIXES:
    - After the first successful probe, the remaining candidate tasks were
      left running and never awaited (wasted requests plus "task was
      destroyed" warnings); they are now cancelled and reaped.
    - feedparser.parse(<url>) downloads the URL synchronously, stalling the
      event loop; those calls now run in a worker thread via asyncio.to_thread.

    NOTE(review): cache_get_host_feed() returns None both for "not cached"
    and "cached as no feed", so negative results are re-discovered each time.
    """
    host_key = normalize_host(hostname)
    cached = cache_get_host_feed(host_key)
    if cached is not None:
        return cached

    bases = [f"https://{host_key}", f"http://{host_key}"]
    if not host_key.startswith("www."):  # always true: normalize_host strips www.
        bases.append(f"https://www.{host_key}")
        bases.append(f"http://www.{host_key}")

    async def try_candidate(url: str) -> Optional[str]:
        # Cheap reachability/MIME check first, full parse only if plausible.
        ok, ctype = await fetch_head_ok(client, url)
        if ok and (not ctype or any(mt in ctype for mt in FEED_MIME_HINTS)):
            parsed = await asyncio.to_thread(feedparser.parse, url)
            if parsed.bozo == 0 and (parsed.feed or parsed.entries):
                return url
        return None

    tasks = [
        asyncio.create_task(try_candidate(base + path))
        for base in bases
        for path in COMMON_FEED_PATHS
    ]
    try:
        for fut in asyncio.as_completed(tasks):
            res = await fut
            if res:
                cache_set_host_feed(host_key, res)
                return res
    finally:
        # Cancel and await outstanding probes whether we succeeded or not.
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)

    for base in bases:
        html = await fetch_page_html(client, base + "/")
        if not html:
            continue
        soup = BeautifulSoup(html, "lxml")
        # Preferred: explicit <link rel="alternate" type="...rss/atom/xml...">.
        for link in soup.find_all("link", rel=True, href=True):
            rels = link.get("rel")
            if isinstance(rels, list):
                rels = {r.lower() for r in rels if r}
            else:
                rels = {str(rels).lower()}
            typ = str(link.get("type", "")).lower()
            href = link.get("href")
            if "alternate" in rels and any(mt in typ for mt in ("rss", "atom", "xml")):
                feed_url = urljoin(base + "/", href)
                parsed = await asyncio.to_thread(feedparser.parse, feed_url)
                if parsed.bozo == 0 and (parsed.feed or parsed.entries):
                    cache_set_host_feed(host_key, feed_url)
                    return feed_url
        # Last resort: anchors whose href mentions rss/atom/feed.
        for a in soup.find_all("a", href=True):
            href = a.get("href", "")
            if any(tok in href.lower() for tok in ("rss", "atom", "feed")):
                feed_url = urljoin(base + "/", href)
                ok, ctype = await fetch_head_ok(client, feed_url)
                if ok:
                    parsed = await asyncio.to_thread(feedparser.parse, feed_url)
                    if parsed.bozo == 0 and (parsed.feed or parsed.entries):
                        cache_set_host_feed(host_key, feed_url)
                        return feed_url

    cache_set_host_feed(host_key, None)
    return None
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# SSE plumbing
|
||||
# ------------------------------
|
||||
class Job:
    """One analysis run: feed URL, a queue of pre-formatted SSE frames, and a
    completion event consumed by the /events endpoint."""

    def __init__(self, feed_url: str):
        self.id = str(uuid.uuid4())
        self.feed_url = feed_url
        self.queue: asyncio.Queue[str] = asyncio.Queue()
        self.done = asyncio.Event()

    async def emit(self, event: str, data: dict):
        """Queue one SSE frame wrapping `data` with the event name and a UTC timestamp."""
        envelope = {"event": event, "data": data, "ts": datetime.now(timezone.utc).isoformat()}
        frame = f"event: {event}\ndata: {json.dumps(envelope)}\n\n"
        await self.queue.put(frame)

    async def finish(self):
        """Mark the job complete and queue the terminal 'done' frame."""
        self.done.set()
        await self.queue.put("event: done\ndata: {}\n\n")
|
||||
|
||||
|
||||
# In-memory registry of live jobs keyed by job id; entries are removed by the
# SSE endpoint when a stream ends (single-process only, no persistence).
JOBS: Dict[str, Job] = {}
|
||||
|
||||
|
||||
async def run_analysis_job(job: Job):
    """Execute one full analysis for job.feed_url, emitting SSE progress frames.

    Pipeline: record a FeedRun row -> download/parse the feed -> fetch each
    post and collect outbound links -> group links by normalized host ->
    concurrently discover a feed per host while rendering a card per host ->
    persist and emit a summary. Errors surface as an 'error' event; a 'done'
    frame is always queued by job.finish().
    """
    # Record the run start; the row is re-queried by URL (newest first) at the
    # end to attach the summary.
    with Session(engine) as sess:
        fr = FeedRun(feed_url=job.feed_url)
        sess.add(fr)
        sess.commit()

    async with httpx.AsyncClient(http2=True) as client:
        try:
            await job.emit("status", {"stage": "feed", "message": "Downloading and parsing feed…"})
            post_urls = await fetch_feed_entries(client, job.feed_url)
            await job.emit("posts", {"count": len(post_urls)})

            # Fetch every post sequentially; duplicates are kept so per-host
            # counts reflect total mentions across posts.
            all_links: List[str] = []
            for idx, post_url in enumerate(post_urls, start=1):
                await job.emit("status", {"stage": "posts", "message": f"Fetching post {idx}/{len(post_urls)}"})
                links = await get_links_for_page(client, post_url)
                all_links.extend(links)
                await job.emit("post_progress", {"current": idx, "total": len(post_urls), "post_url": post_url})

            # Aggregate links per normalized host (links with no netloc are dropped).
            host_map: Dict[str, HostSummary] = {}
            for link in all_links:
                host = normalize_host(urlparse(link).netloc)
                if not host:
                    continue
                hs = host_map.setdefault(host, HostSummary(hostname=host))
                hs.count += 1
                hs.unique_links.add(link)
                hs.link_counts[link] += 1

            # Most-mentioned hosts first; max_count scales the card bar widths.
            hosts_sorted = sorted(host_map.values(), key=lambda s: s.count, reverse=True)
            await job.emit("hosts", {"count": len(hosts_sorted)})

            sem = asyncio.Semaphore(DISCOVERY_CONCURRENCY)
            max_count = max((h.count for h in hosts_sorted), default=1)

            async def work(hs: HostSummary, idx: int, total: int):
                # Discover this host's feed, then render and emit its card.
                async with sem:
                    await job.emit("status", {"stage": "discover", "message": f"Discovering feed for {hs.hostname} ({idx}/{total})"})
                    feed = await discover_feed_for_host(client, hs.hostname)
                    hs.feed_url = feed
                    host_dict = {
                        "hostname": hs.hostname,
                        "count": hs.count,
                        "unique_link_count": len(hs.unique_links),
                        "links": sorted(list(hs.unique_links)),
                        # Only links mentioned more than once are surfaced.
                        "top_links": [
                            {"url": url, "count": cnt}
                            for url, cnt in hs.link_counts.most_common()
                            if cnt > 1
                        ],
                        "feed_url": hs.feed_url,
                    }
                    html = render_host_card(host_dict, max_count, index=idx)
                    await job.emit("host_card", {"html": html, "index": idx, "total": total})

            tasks = [asyncio.create_task(work(hs, i, len(hosts_sorted))) for i, hs in enumerate(hosts_sorted, start=1)]
            # Periodic pings while discovery tasks are outstanding, so the SSE
            # stream stays visibly alive during long discoveries.
            async def heartbeat():
                while any(not t.done() for t in tasks):
                    await job.emit("status", {"stage": "discover", "message": "Still discovering host feeds…"})
                    await asyncio.sleep(3)
            hb = asyncio.create_task(heartbeat())
            await asyncio.gather(*tasks)
            hb.cancel()

            summary = {
                "feed_url": job.feed_url,
                "post_count": len(post_urls),
                "hosts": [h.hostname for h in hosts_sorted],
                "fetched_at": datetime.now(timezone.utc).isoformat(),
            }
            # NOTE(review): re-querying by feed_url (newest first) could attach
            # the summary to a different run if the same feed is started twice
            # concurrently — consider keeping fr.id from the first session.
            with Session(engine) as sess:
                fr = sess.exec(select(FeedRun).where(FeedRun.feed_url == job.feed_url).order_by(FeedRun.started_at.desc())).first()
                if fr:
                    fr.summary_json = json.dumps(summary)
                    fr.finished_at = datetime.now(timezone.utc)
                    sess.add(fr)
                    sess.commit()

            await job.emit("summary", summary)
        except Exception as e:
            await job.emit("error", {"message": str(e)})
        finally:
            await job.finish()
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# Template rendering for components
|
||||
# ------------------------------
|
||||
# Jinja environment used for full pages (index route) and for server-rendered
# host cards pushed over SSE (render_host_card).
templates = Jinja2Templates(directory="templates")
|
||||
|
||||
def render_host_card(host: dict, max_count: int, index: int) -> str:
    """Render components/host_card.html to an HTML string for SSE delivery.

    A stub stands in for the Request object Jinja2Templates normally receives;
    the card template does not dereference it. BUG FIX: dropped the unused
    `from fastapi import Request` import that the old body performed on every
    call.
    """

    class _StubRequest:
        def __init__(self):
            # Only `.state` existed on the previous dummy; keep that surface.
            self.state = type("s", (), {})()

    return templates.get_template("components/host_card.html").render(
        request=_StubRequest(), host=host, max_count=max_count, index=index
    )
|
||||
|
||||
|
||||
# ------------------------------
|
||||
# FastAPI app + routes
|
||||
# ------------------------------
|
||||
# Application instance; static assets (styles.css) are served under /static.
app = FastAPI(title="RSS Link Audit", version="1.2.0")
app.mount("/static", StaticFiles(directory="static"), name="static")
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
    """Serve the single-page UI (templates/index.html)."""
    return templates.TemplateResponse("index.html", {"request": request})
|
||||
|
||||
|
||||
@app.post("/start", response_class=JSONResponse)
async def start(feed_url: str = Form(...)):
    """Create a Job for `feed_url`, launch it in the background, return its id.

    BUG FIX: keep a strong reference to the background task on the job.
    asyncio only holds weak references to tasks, so a bare create_task()
    result held by nobody can be garbage-collected mid-run.
    """
    job = Job(feed_url)
    JOBS[job.id] = job
    job.task = asyncio.create_task(run_analysis_job(job))
    return {"job_id": job.id}
|
||||
|
||||
|
||||
@app.get("/events/{job_id}")
async def sse(job_id: str):
    """SSE stream of a job's progress frames; 404 when the job id is unknown.

    BUG FIX: the old loop broke as soon as job.done was set, even while
    frames (including the terminal 'done' frame) were still queued behind
    the one just yielded, so clients could miss the summary/done events.
    The stream now ends only when the job is done AND the queue is drained.
    """
    job = JOBS.get(job_id)
    if not job:
        raise HTTPException(404, "Job not found")

    async def event_gen():
        yield f"event: hello\ndata: {{\"job_id\":\"{job.id}\"}}\n\n"
        while not (job.done.is_set() and job.queue.empty()):
            try:
                item = await asyncio.wait_for(job.queue.get(), timeout=30.0)
                yield item
            except asyncio.TimeoutError:
                # Keepalive so proxies do not close an idle connection.
                yield "event: ping\ndata: {}\n\n"
        JOBS.pop(job.id, None)

    return StreamingResponse(event_gen(), media_type="text/event-stream")
|
||||
|
||||
|
||||
@app.post("/api/analyze", response_class=JSONResponse)
async def analyze_api(payload: Dict):
    """Blocking analysis API: runs the whole job before replying.

    Expects a JSON body containing 'feed_url'; 400 when missing/empty.
    Progress events are discarded (no SSE consumer).
    """
    feed_url = payload.get("feed_url")
    if not feed_url:
        raise HTTPException(status_code=400, detail="Missing 'feed_url'")
    await run_analysis_job(Job(feed_url))
    return JSONResponse(content={"ok": True})
|
||||
|
||||
|
||||
@app.get("/healthz")
async def healthz():
    """Liveness probe: always returns {"ok": True}."""
    return {"ok": True}
|
||||
10
requirements.txt
Normal file
10
requirements.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
fastapi
|
||||
uvicorn[standard]
|
||||
httpx
|
||||
feedparser
|
||||
beautifulsoup4
|
||||
lxml
|
||||
jinja2
|
||||
sqlmodel
|
||||
aiosqlite
|
||||
sqlalchemy>=2.0
|
||||
84
static/styles.css
Normal file
84
static/styles.css
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
/* Royal Armory Palette */
|
||||
:root {
|
||||
--ra-ink: #000030;
|
||||
--ra-plum: #3f0a57;
|
||||
--ra-magenta: #85106b;
|
||||
--ra-ruby: #b02c2c;
|
||||
--ra-bronze: #b8673e;
|
||||
--ra-amber: #d9932b;
|
||||
--ra-gold: #f0bd71;
|
||||
--ra-cream: #ffe3ba;
|
||||
|
||||
--ra-bg: var(--ra-ink);
|
||||
--ra-panel: #0b0b3f;
|
||||
--ra-copper: #6f3b2b;
|
||||
--ra-ruby-dark: #8c2323;
|
||||
}
|
||||
|
||||
* { box-sizing: border-box; }
|
||||
html, body {
|
||||
margin: 0;
|
||||
background: var(--ra-bg);
|
||||
color: var(--ra-cream);
|
||||
font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Ubuntu, Cantarell, Noto Sans, Arial;
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
a { color: var(--ra-gold); text-decoration: none; }
|
||||
a:hover { text-decoration: underline; }
|
||||
|
||||
header, footer { background: linear-gradient(0deg, rgba(64,10,87,0.25), rgba(64,10,87,0.25)); }
|
||||
main { padding: 1rem; }
|
||||
|
||||
.link { word-break: break-all; text-underline-offset: 3px; }
|
||||
|
||||
.bar-wrap { width: 100%; background: rgba(240,189,113,0.12); height: 12px; }
|
||||
.bar { height: 12px; background: linear-gradient(90deg, var(--ra-amber), var(--ra-gold)); }
|
||||
|
||||
.btn-more { background: var(--ra-plum); padding: 6px 10px; border-radius: 10px; font-weight: 600; color: var(--ra-cream); }
|
||||
|
||||
.more-list[data-expanded="false"] { display: none; }
|
||||
.more-list[data-expanded="true"] { display: block; }
|
||||
|
||||
/* utilities */
|
||||
.max-w-3xl { max-width: 48rem; }
|
||||
.max-w-5xl { max-width: 64rem; }
|
||||
.mx-auto { margin-left: auto; margin-right: auto; }
|
||||
.p-6 { padding: 1.5rem; }
|
||||
.p-5 { padding: 1.25rem; }
|
||||
.p-4 { padding: 1rem; }
|
||||
.px-6 { padding-left: 1.5rem; padding-right: 1.5rem; }
|
||||
.py-4 { padding-top: 1rem; padding-bottom: 1rem; }
|
||||
.py-10{ padding-top: 2.5rem; padding-bottom: 2.5rem; }
|
||||
.mt-2 { margin-top: 0.5rem; }
|
||||
.mt-4 { margin-top: 1rem; }
|
||||
.mt-6 { margin-top: 1.5rem; }
|
||||
.mb-1 { margin-bottom: 0.25rem; }
|
||||
.mb-2 { margin-bottom: 0.5rem; }
|
||||
.space-y-1 > * + * { margin-top: 0.25rem; }
|
||||
.space-y-6 > * + * { margin-top: 1.5rem; }
|
||||
.rounded-2xl { border-radius: 1rem; }
|
||||
.rounded-xl { border-radius: 0.75rem; }
|
||||
.shadow { box-shadow: 0 10px 30px rgba(0,0,0,0.25); }
|
||||
.font-bold { font-weight: 700; }
|
||||
.font-semibold { font-weight: 600; }
|
||||
.text-sm { font-size: 0.875rem; }
|
||||
.text-xl { font-size: 1.25rem; }
|
||||
.text-3xl { font-size: 1.875rem; }
|
||||
.opacity-70 { opacity: 0.7; }
|
||||
.opacity-80 { opacity: 0.8; }
|
||||
.border { border-width: 1px; }
|
||||
.border-b { border-bottom-width: 1px; }
|
||||
.flex { display: flex; }
|
||||
.items-center { align-items: center; }
|
||||
.items-baseline { align-items: baseline; }
|
||||
.justify-between { justify-content: space-between; }
|
||||
.gap-2 { gap: 0.5rem; }
|
||||
.gap-4 { gap: 1rem; }
|
||||
.min-w-0 { min-width: 0; }
|
||||
.shrink-0 { flex-shrink: 0; }
|
||||
.break-all { word-break: break-all; }
|
||||
.w-3 { width: 0.75rem; }
|
||||
.h-3 { height: 0.75rem; }
|
||||
input, button { border: none; }
|
||||
button { cursor: pointer; }
|
||||
62
templates/components/host_card.html
Normal file
62
templates/components/host_card.html
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
{% set pct = (100 * host.count // (max_count or 1)) %}
|
||||
<article class="rounded-2xl bg-[var(--ra-panel)] border border-[var(--ra-copper)] overflow-hidden">
|
||||
<header class="p-4 flex items-center justify-between gap-4">
|
||||
<div class="min-w-0">
|
||||
<h2 class="text-xl font-semibold break-all">{{ host.hostname }}</h2>
|
||||
<div class="text-sm opacity-80">
|
||||
<span class="mr-3">Links: <strong>{{ host.count }}</strong></span>
|
||||
<span>Unique: <strong>{{ host.unique_link_count }}</strong></span>
|
||||
</div>
|
||||
</div>
|
||||
{% if host.feed_url %}
|
||||
<a href="{{ host.feed_url }}" target="_blank" rel="noopener"
|
||||
class="shrink-0 px-3 py-1 rounded-lg bg-[var(--ra-amber)] text-[var(--ra-ink)] font-semibold hover:opacity-90">
|
||||
RSS / Atom
|
||||
</a>
|
||||
{% endif %}
|
||||
</header>
|
||||
|
||||
<div class="bar-wrap">
|
||||
<div class="bar" style="width: {{ pct }}%"></div>
|
||||
</div>
|
||||
|
||||
<div class="p-4 space-y-4">
|
||||
{% if host.top_links %}
|
||||
<div>
|
||||
<div class="text-sm font-semibold mb-2">Top links (mentioned > 1):</div>
|
||||
<ul class="space-y-1 text-sm">
|
||||
{% for tl in host.top_links %}
|
||||
<li class="flex items-baseline gap-2">
|
||||
<span class="inline-block px-2 py-0.5 rounded-md bg-[var(--ra-ruby)]">{{ tl.count }}</span>
|
||||
<a class="link" href="{{ tl.url }}" target="_blank" rel="noopener">{{ tl.url }}</a>
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
{% set list_id = "links-" ~ index %}
|
||||
{% set links = host.links %}
|
||||
{% set preview = links[:8] %}
|
||||
{% set remainder = links[8:] %}
|
||||
<div>
|
||||
<div class="text-sm font-semibold mb-2">Links:</div>
|
||||
<ul class="space-y-1 text-sm">
|
||||
{% for url in preview %}
|
||||
<li><a class="link" href="{{ url }}" target="_blank" rel="noopener">{{ url }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
{% if remainder %}
|
||||
<div id="{{ list_id }}" class="more-list" data-expanded="false">
|
||||
<ul class="space-y-1 text-sm">
|
||||
{% for url in remainder %}
|
||||
<li><a class="link" href="{{ url }}" target="_blank" rel="noopener">{{ url }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
<button class="btn-more mt-2" data-more-btn data-target="{{ list_id }}">More</button>
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
</article>
|
||||
104
templates/index.html
Normal file
104
templates/index.html
Normal file
|
|
@ -0,0 +1,104 @@
|
|||
{% extends "layout.html" %}
|
||||
{% block content %}
|
||||
<section class="mx-auto max-w-3xl p-6">
|
||||
<h1 class="text-3xl font-bold mb-2">RSS Link Audit</h1>
|
||||
<p class="mb-6 opacity-90">Paste a feed URL. This version uses <strong>SQLite/SQLModel caching</strong> and streams progress over <strong>SSE</strong>.</p>
|
||||
|
||||
<form id="feed-form" class="space-y-4 bg-[var(--ra-panel)] p-5 rounded-2xl shadow">
|
||||
<label class="block">
|
||||
<span class="block mb-2 font-semibold">Feed URL</span>
|
||||
<input id="feed-input" type="url" name="feed_url" placeholder="https://example.com/feed.xml"
|
||||
required
|
||||
class="w-full p-3 rounded-xl bg-[var(--ra-ink)] text-[var(--ra-cream)] border border-[var(--ra-copper)] focus:outline-none focus:ring-2 focus:ring-[var(--ra-amber)]" />
|
||||
</label>
|
||||
<button class="px-4 py-2 rounded-xl font-semibold bg-[var(--ra-ruby)] hover:bg-[var(--ra-ruby-dark)]">
|
||||
Analyze
|
||||
</button>
|
||||
</form>
|
||||
|
||||
<div id="status" class="mt-6 text-sm opacity-80"></div>
|
||||
|
||||
<section id="summary" class="mt-6"></section>
|
||||
<section id="hosts" class="mt-4 space-y-6"></section>
|
||||
</section>
|
||||
|
||||
<script>
|
||||
const statusEl = document.getElementById('status');
|
||||
const hostsEl = document.getElementById('hosts');
|
||||
const summaryEl = document.getElementById('summary');
|
||||
const form = document.getElementById('feed-form');
|
||||
|
||||
function setStatus(html) { statusEl.innerHTML = html; }
|
||||
function appendHostCard(html) {
|
||||
const div = document.createElement('div');
|
||||
div.innerHTML = html;
|
||||
hostsEl.appendChild(div.firstElementChild);
|
||||
}
|
||||
function setSummary(feed_url, post_count, host_count) {
|
||||
summaryEl.innerHTML = `
|
||||
<div class="rounded-2xl bg-[var(--ra-panel)] border border-[var(--ra-copper)] p-4">
|
||||
<div class="font-semibold mb-1">Summary</div>
|
||||
<div>Feed: <a class="underline" href="${feed_url}" target="_blank" rel="noopener">${feed_url}</a></div>
|
||||
<div>Posts parsed: <strong>${post_count}</strong></div>
|
||||
<div>Hosts found: <strong>${host_count}</strong></div>
|
||||
</div>`;
|
||||
}
|
||||
|
||||
form.addEventListener('submit', async (e) => {
|
||||
e.preventDefault();
|
||||
hostsEl.innerHTML = '';
|
||||
summaryEl.innerHTML = '';
|
||||
setStatus('Starting…');
|
||||
|
||||
const fd = new FormData(form);
|
||||
const resp = await fetch('/start', { method: 'POST', body: fd });
|
||||
if (!resp.ok) {
|
||||
setStatus('Failed to start.');
|
||||
return;
|
||||
}
|
||||
const { job_id } = await resp.json();
|
||||
setStatus('Job started. Connecting…');
|
||||
|
||||
const es = new EventSource(`/events/${job_id}`);
|
||||
let postCount = 0, hostsCount = 0, seenCards = 0;
|
||||
|
||||
es.addEventListener('hello', () => setStatus('Connected. Parsing feed…'));
|
||||
es.addEventListener('status', (ev) => {
|
||||
const d = JSON.parse(ev.data).data;
|
||||
setStatus(`${d.message}`);
|
||||
});
|
||||
es.addEventListener('posts', (ev) => {
|
||||
const data = JSON.parse(ev.data).data;
|
||||
postCount = data.count || 0;
|
||||
setStatus(`Posts: ${postCount}. Fetching pages…`);
|
||||
});
|
||||
es.addEventListener('post_progress', (ev) => {
|
||||
const d = JSON.parse(ev.data).data;
|
||||
setStatus(`Fetching posts ${d.current}/${d.total}…`);
|
||||
});
|
||||
es.addEventListener('hosts', (ev) => {
|
||||
const data = JSON.parse(ev.data).data;
|
||||
hostsCount = data.count || 0;
|
||||
setStatus(`Found ${hostsCount} hosts. Discovering their feeds…`);
|
||||
});
|
||||
es.addEventListener('host_card', (ev) => {
|
||||
const data = JSON.parse(ev.data).data;
|
||||
appendHostCard(data.html);
|
||||
seenCards = data.index;
|
||||
setStatus(`Rendered ${seenCards}/${data.total} hosts… Still discovering feeds…`);
|
||||
});
|
||||
es.addEventListener('summary', (ev) => {
|
||||
const data = JSON.parse(ev.data).data;
|
||||
setSummary(data.feed_url, postCount, hostsCount);
|
||||
});
|
||||
es.addEventListener('error', (ev) => {
|
||||
const data = JSON.parse(ev.data).data;
|
||||
setStatus('Error: ' + (data.message || 'Unknown'));
|
||||
});
|
||||
es.addEventListener('done', () => {
|
||||
setStatus('Done.');
|
||||
es.close();
|
||||
});
|
||||
});
|
||||
</script>
|
||||
{% endblock %}
|
||||
24
templates/layout.html
Normal file
24
templates/layout.html
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<title>RSS Link Audit</title>
|
||||
<link rel="stylesheet" href="/static/styles.css"/>
|
||||
</head>
|
||||
<body>
|
||||
<header class="px-6 py-4 border-b border-[var(--ra-copper)]">
|
||||
<div class="max-w-5xl mx-auto flex items-center gap-4">
|
||||
<div class="w-3 h-3 rounded-full bg-[var(--ra-gold)]"></div>
|
||||
<a href="/" class="font-bold hover:underline">RSS Link Audit</a>
|
||||
<span class="opacity-70 text-sm">with SQLite cache + SSE</span>
|
||||
</div>
|
||||
</header>
|
||||
<main class="max-w-5xl mx-auto">
|
||||
{% block content %}{% endblock %}
|
||||
</main>
|
||||
<footer class="px-6 py-10 text-sm opacity-70">
|
||||
<div class="max-w-5xl mx-auto">Built with FastAPI • Palette: Royal Armory</div>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
Loading…
Add table
Add a link
Reference in a new issue