Below is a Python script that finds RSS feed links on any given URL. It first checks for <link> tags with rel="alternate", then falls back to <a> tags; any candidates it finds are parsed and the results returned. If it still hasn't found anything, the script tries a list of common feed routes.
I use this to quickly find RSS feeds on the fly from the command line. If the script matches any links, it copies the first URL to your clipboard. There's also some additional, optional logging and error handling in the script.
Header
You'll need to install the following packages:
pip install requests feedparser pyperclip bs4
After that, add the following to the top of your script:
import sys  # used by update_output() below
import time
import logging
import urllib.parse

import pyperclip
import requests
import feedparser
from bs4 import BeautifulSoup

# Prompt for the URL and start a timer
url = input("Enter the URL: ")
timer = time.time()
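The errors the script logs via logging.error go to stderr by default, which can interrupt the single-line status output. If you'd rather keep the console clean, one option (not part of the original script, just a sketch with an arbitrary filename) is to send them to a file:

# Optional: write errors to a log file instead of stderr
logging.basicConfig(
    filename="rss_finder.log",
    level=logging.ERROR,
    format="%(asctime)s %(levelname)s %(message)s",
)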
Output
This is a small helper that writes status messages to the console. By default it clears the current line by overwriting it with spaces, moves the cursor back to the beginning, and writes the new message in place, so each update replaces the last. Pass clear=False to write the message on a new line instead.
# Output
def update_output(message, clear=True):
    # Truncate long messages so they fit on one line
    if len(message) > 80:
        message = message[:80]
    if clear:
        # Clear the line by overwriting it with spaces
        sys.stdout.write('\r' + ' ' * 80)
        # Move the cursor back to the beginning
        sys.stdout.write('\r')
        # Write the new message on the same line
        sys.stdout.write(message)
    else:
        # Write the message on a new line
        sys.stdout.write('\n' + message)
    # Force the writing to flush
    sys.stdout.flush()
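To see the effect, call it twice with the default clear=True: the second message overwrites the first, so the console only ever shows the latest status (the messages here are just placeholders):

update_output("[ ] example.com | FETCHING URL")
update_output("[-] example.com | [MATCHED] <link> TAG")  # replaces the previous line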
Primary Function
# Find RSS Links on Page
def find_rss_links(url):
    print("Searching for RSS Links")
    update_output(f"[ ] {url[:80]} | STARTING", False)

    if not url.startswith("http"):
        url = "https://" + url
        update_output(f"[ ] {url[:40]} | ADDED `https://` TO URL")

    try:
        update_output(f"[ ] {url[:40]} | FETCHING URL")
        raw = requests.get(url, timeout=3).text
        html = BeautifulSoup(raw, "html.parser")
    except requests.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        update_output(f"[!] {url[:40]} | [ERROR] FETCHING")
        return []  # nothing to return if the page couldn't be fetched

    # Extract RSS feed links from <link> tags with rel="alternate"
    result = set()
    feed_urls = html.find_all("link", rel="alternate")
    update_output(f"[ ] {url[:40]} | FOUND {len(feed_urls)} <link> TAGS")
    for f in feed_urls:
        feed_type = f.get("type")
        if feed_type and ("rss" in feed_type or "xml" in feed_type):
            href = f.get("href")
            if href:
                result.add(resolve_url(url, href))
                update_output(f"[-] {url[:40]} | [MATCHED] <link> TAG")

    # Extract RSS feed links from <a> tags
    if not result:
        update_output(f"[ ] {url[:40]} | NO LINKS FOUND, CHECKING <a> TAGS")
        parsed = urllib.parse.urlparse(url)
        base_url = parsed.scheme + "://" + parsed.hostname
        atags = html.find_all("a")
        if not atags:
            update_output(f"[-] {url[:40]} | NO <a> TAGS FOUND")
        for a in atags:
            href = a.get("href")
            if href and any(keyword in href for keyword in ["xml", "rss", "feed", "atom"]):
                if is_rss_feed(href):
                    result.add(resolve_url(url, href))
                    update_output(f"[-] {url[:40]} | [MATCHED] <a> TAG")

    # Check common RSS feed routes if no feed links were found
    if not result:
        update_output(f"[ ] {url[:40]} | NO LINKS FOUND, CHECKING ROUTES")
        routes = [
            "atom.xml", "feed.xml", "rss.xml", "index.xml",
            "atom.json", "feed.json", "rss.json", "index.json",
            "feed/", "rss/",
            "feed/index.xml", "rss/index.xml",
            "feed/index.json", "rss/index.json",
            "feed/atom", "rss/atom", "feed/rss", "rss/rss",
            "feed/atom.xml", "rss/atom.xml", "feed/rss.xml", "rss/rss.xml",
            "feed/atom.json", "rss/atom.json", "feed/rss.json", "rss/rss.json",
        ]
        for route in routes:
            update_output(f"[ ] {url[:40]} | CHECKING ROUTE `{route}`")
            try:
                href = base_url + "/" + route
                if is_rss_feed(href):
                    result.add(href)
                    update_output(f"[-] {url[:40]} | [MATCHED] ROUTE `{route}`")
                    break
            except Exception as e:
                logging.error(f"Error parsing {href}: {e}")
                update_output(f"[!] {url[:40]} | [ERROR] PARSING ROUTE")
                continue

    # Parse the URLs in the result set
    for feed_url in list(result):
        f = feedparser.parse(feed_url)
        if f.entries:
            update_output(f"[x] {url[:40]} | PARSED: [SUCCESS] 😁")

    result = list(result)
    update_output("")
    return result
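The function leans on two small helpers, resolve_url and is_rss_feed, that aren't shown above. Only their names and call sites come from the script; the bodies below are a minimal sketch of one way they could work, assuming resolve_url just resolves relative hrefs against the page URL and is_rss_feed checks whether a candidate URL actually parses as a feed:

def resolve_url(base_url, href):
    # Resolve relative hrefs (e.g. "/feed.xml") against the page URL;
    # absolute hrefs pass through unchanged
    return urllib.parse.urljoin(base_url, href)


def is_rss_feed(feed_url):
    # Treat a URL as a feed only if feedparser can pull at least one entry from it
    try:
        return bool(feedparser.parse(feed_url).entries)
    except Exception:
        return False

Define them above find_rss_links so they're available when it runs.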
Then I handle the output and copy the first URL to the clipboard:
rss_link = find_rss_links(url)

if rss_link:
    # Copy the first matched feed URL to the clipboard
    rl = rss_link[0]
    pyperclip.copy(rl)
    # Print a numbered list of every feed that was found
    for i, url in enumerate(rss_link):
        rss_link[i] = f"[{i + 1}] {url}"
        print(f"{i + 1}. {url}")
    print(f"[{time.time() - timer:.2f}s]")
else:
    update_output(f"[{time.time() - timer:.2f}s] {url[:40]} | NO RSS LINKS FOUND")
I've assigned the script to an alias in my ~/.zshrc file:
alias rss="python3 ${LOCATION_OF_SCRIPT}/rss.py"
When it's run, it outputs the RSS feed links it finds on the page, along with a timer that shows how long the search took.
Example output
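Against a hypothetical site that exposes a single feed at /feed.xml, a run would end up looking roughly like this once the transient status lines have overwritten themselves (at this point the feed URL is also on the clipboard):

$ rss
Enter the URL: example.com
Searching for RSS Links
1. https://example.com/feed.xml
[0.42s]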
Hope this helps you in your data endeavors. Happy coding.