Below is a Python script that finds RSS feed links on any given URL. It first checks for <link> tags with rel="alternate", then falls back to <a> tags; any candidates it finds are parsed and the results returned. If it still hasn't found anything, the script tries a list of common feed routes.
I use this to quickly find RSS feeds on the fly from the command line. If the script matches any links, it copies the first URL to your clipboard. There's also some additional, optional logging and error handling in the script.
Header
You'll need to install the following packages:
pip install requests feedparser pyperclip bs4
After that, add the following to the top of your script:
import sys  # used by update_output() below
import time
import logging
import urllib.parse

import pyperclip
import requests
import feedparser
from bs4 import BeautifulSoup

# Prompt for the URL and start a timer
url = input("Enter the URL: ")
timer = time.time()
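The errors the script logs via logging.error go to stderr by default, which can interrupt the single-line status output. If you'd rather keep the console clean, one option (not part of the original script, just a sketch with an arbitrary filename) is to send them to a file:

# Optional: write errors to a log file instead of stderr
logging.basicConfig(
    filename="rss_finder.log",
    level=logging.ERROR,
    format="%(asctime)s %(levelname)s %(message)s",
)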
Output
This is a small helper that writes status messages to the console. By default it clears the current line by overwriting it with spaces, moves the cursor back to the beginning, and writes the new message in place, so each update replaces the last. Pass clear=False to write the message on a new line instead.
# Output
def update_output(message, clear=True):
    # Truncate long messages so they fit on one line
    if len(message) > 80:
        message = message[:80]
    if clear:
        # Clear the line by overwriting it with spaces
        sys.stdout.write('\r' + ' ' * 80)
        # Move the cursor back to the beginning
        sys.stdout.write('\r')
        # Write the new message on the same line
        sys.stdout.write(message)
    else:
        # Write the message on a new line
        sys.stdout.write('\n' + message)
    # Force the writing to flush
    sys.stdout.flush()
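To see the effect, call it twice with the default clear=True: the second message overwrites the first, so the console only ever shows the latest status (the messages here are just placeholders):

update_output("[ ] example.com | FETCHING URL")
update_output("[-] example.com | [MATCHED] <link> TAG")  # replaces the previous line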
Primary Function
# Find RSS Links on Page
def find_rss_links(url):
    print("Searching for RSS Links")
    update_output(f"[ ] {url[:80]} | STARTING", False)

    if not url.startswith("http"):
        url = "https://" + url
        update_output(f"[ ] {url[:40]} | ADDED `https://` TO URL")

    try:
        update_output(f"[ ] {url[:40]} | FETCHING URL")
        raw = requests.get(url, timeout=3).text
        html = BeautifulSoup(raw, "html.parser")
    except requests.RequestException as e:
        logging.error(f"Error fetching {url}: {e}")
        update_output(f"[!] {url[:40]} | [ERROR] FETCHING")
        return []  # nothing to return if the page couldn't be fetched

    # Extract RSS feed links from <link> tags with rel="alternate"
    result = set()
    feed_urls = html.find_all("link", rel="alternate")
    update_output(f"[ ] {url[:40]} | FOUND {len(feed_urls)} <link> TAGS")
    for f in feed_urls:
        feed_type = f.get("type")
        if feed_type and ("rss" in feed_type or "xml" in feed_type):
            href = f.get("href")
            if href:
                result.add(resolve_url(url, href))
                update_output(f"[-] {url[:40]} | [MATCHED] <link> TAG")

    # Extract RSS feed links from <a> tags
    if not result:
        update_output(f"[ ] {url[:40]} | NO LINKS FOUND, CHECKING <a> TAGS")
        parsed = urllib.parse.urlparse(url)
        base_url = parsed.scheme + "://" + parsed.hostname
        atags = html.find_all("a")
        if not atags:
            update_output(f"[-] {url[:40]} | NO <a> TAGS FOUND")
        for a in atags:
            href = a.get("href")
            if href and any(keyword in href for keyword in ["xml", "rss", "feed", "atom"]):
                if is_rss_feed(href):
                    result.add(resolve_url(url, href))
                    update_output(f"[-] {url[:40]} | [MATCHED] <a> TAG")

    # Check common RSS feed routes if no feed links were found
    if not result:
        update_output(f"[ ] {url[:40]} | NO LINKS FOUND, CHECKING ROUTES")
        routes = [
            "atom.xml", "feed.xml", "rss.xml", "index.xml",
            "atom.json", "feed.json", "rss.json", "index.json",
            "feed/", "rss/",
            "feed/index.xml", "rss/index.xml",
            "feed/index.json", "rss/index.json",
            "feed/atom", "rss/atom", "feed/rss", "rss/rss",
            "feed/atom.xml", "rss/atom.xml", "feed/rss.xml", "rss/rss.xml",
            "feed/atom.json", "rss/atom.json", "feed/rss.json", "rss/rss.json",
        ]
        for route in routes:
            update_output(f"[ ] {url[:40]} | CHECKING ROUTE `{route}`")
            try:
                href = base_url + "/" + route
                if is_rss_feed(href):
                    result.add(href)
                    update_output(f"[-] {url[:40]} | [MATCHED] ROUTE `{route}`")
                    break
            except Exception as e:
                logging.error(f"Error parsing {href}: {e}")
                update_output(f"[!] {url[:40]} | [ERROR] PARSING ROUTE")
                continue

    # Parse the URLs in the result set
    for feed_url in list(result):
        f = feedparser.parse(feed_url)
        if f.entries:
            update_output(f"[x] {url[:40]} | PARSED: [SUCCESS] 😁")

    result = list(result)
    update_output("")
    return result
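The function leans on two small helpers, resolve_url and is_rss_feed, that aren't shown above. Only their names and call sites come from the script; the bodies below are a minimal sketch of one way they could work, assuming resolve_url just resolves relative hrefs against the page URL and is_rss_feed checks whether a candidate URL actually parses as a feed:

def resolve_url(base_url, href):
    # Resolve relative hrefs (e.g. "/feed.xml") against the page URL;
    # absolute hrefs pass through unchanged
    return urllib.parse.urljoin(base_url, href)


def is_rss_feed(feed_url):
    # Treat a URL as a feed only if feedparser can pull at least one entry from it
    try:
        return bool(feedparser.parse(feed_url).entries)
    except Exception:
        return False

Define them above find_rss_links so they're available when it runs.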
Then I handle the output and copy the first URL to the clipboard:
rss_link = find_rss_links(url)

if rss_link:
    # Copy the first matched feed URL to the clipboard
    rl = rss_link[0]
    pyperclip.copy(rl)
    # Print a numbered list of every feed that was found
    for i, url in enumerate(rss_link):
        rss_link[i] = f"[{i + 1}] {url}"
        print(f"{i + 1}. {url}")
    print(f"[{time.time() - timer:.2f}s]")
else:
    update_output(f"[{time.time() - timer:.2f}s] {url[:40]} | NO RSS LINKS FOUND")
I've assigned the script to an alias in my ~/.zshrc file:
alias rss="python3 ${LOCATION_OF_SCRIPT}/rss.py"
When it's run, it outputs the RSS feed links it finds on the page, along with a timer that shows how long the search took.
Example output
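Against a hypothetical site that exposes a single feed at /feed.xml, a run would end up looking roughly like this once the transient status lines have overwritten themselves (at this point the feed URL is also on the clipboard):

$ rss
Enter the URL: example.com
Searching for RSS Links
1. https://example.com/feed.xml
[0.42s]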
Hope this helps you in your data endeavors. Happy coding.