# yottob/feed_parser.py

"""YouTube RSS feed parser module.

This module handles fetching and parsing YouTube channel RSS feeds,
with filtering capabilities to exclude unwanted content like Shorts.
"""

from datetime import datetime
import feedparser
from typing import Dict, List, Optional

from sqlalchemy.orm import Session
from sqlalchemy.exc import IntegrityError

from models import Channel, VideoEntry


class FeedEntry:
    """A single video item taken from a YouTube RSS feed.

    Only the two fields the parser keeps for each item are stored:
    the item's title and its link.
    """

    def __init__(self, title: str, link: str):
        self.title, self.link = title, link

    def to_dict(self) -> Dict[str, str]:
        """Return the entry as a fresh ``{"title": ..., "link": ...}`` dict."""
        return dict(title=self.title, link=self.link)


class YouTubeFeedParser:
    """Parser for YouTube channel RSS feeds.

    Builds the feed URL for a single channel, fetches and parses it with
    ``feedparser``, and can persist the parsed result through a SQLAlchemy
    session.
    """

    BASE_URL = "https://www.youtube.com/feeds/videos.xml"

    # Path segment used to recognise Shorts links. Matching the full segment
    # (rather than the bare word "shorts") avoids dropping a regular video
    # whose ID or query string merely contains that substring.
    _SHORTS_PATH = "/shorts/"

    def __init__(self, channel_id: str):
        """Initialize parser with a YouTube channel ID.

        Args:
            channel_id: The YouTube channel ID to fetch feeds from
        """
        self.channel_id = channel_id
        self.url = f"{self.BASE_URL}?channel_id={channel_id}"

    def fetch_feed(self, filter_shorts: bool = True) -> Optional[Dict]:
        """Fetch and parse the RSS feed.

        Args:
            filter_shorts: If True, exclude YouTube Shorts from results

        Returns:
            Dictionary with the keys ``feed_title``, ``feed_link`` and
            ``entries`` (a list of ``{"title", "link"}`` dicts), or None if
            the fetch fails: no HTTP response, a non-200 status, or a
            response without feed title/link metadata.
        """
        feed = feedparser.parse(self.url)

        # feedparser only sets ``status`` when an HTTP response was actually
        # received; on a network-level failure the key is absent, so it must
        # be read with .get() rather than attribute access.
        if feed.get("status") != 200:
            return None

        # A 200 response that is not a usable feed has no channel metadata;
        # treat that as a failed fetch instead of raising AttributeError.
        feed_title = feed.feed.get("title")
        feed_link = feed.feed.get("link")
        if feed_title is None or feed_link is None:
            return None

        entries = []
        for entry in feed.entries:
            link = entry.get("link")
            if not link:
                # The link is the key used to de-duplicate videos when
                # saving; an entry without one cannot be stored.
                continue
            if filter_shorts and self._SHORTS_PATH in link:
                continue
            entries.append(FeedEntry(title=entry.get("title", ""), link=link))

        return {
            "feed_title": feed_title,
            "feed_link": feed_link,
            "entries": [entry.to_dict() for entry in entries],
        }

    def save_to_db(self, db_session: Session, feed_data: Dict) -> Channel:
        """Save feed data to the database.

        Args:
            db_session: SQLAlchemy database session
            feed_data: Dictionary containing feed metadata and entries
                (from fetch_feed); must not be None

        Returns:
            The Channel model instance

        Raises:
            KeyError: If ``feed_data`` lacks ``feed_title``, ``feed_link``
                or ``entries``.
            Exception: Any error raised while writing is re-raised after the
                session has been rolled back.

        This method uses upsert logic:
        - Updates existing channel if it exists
        - Creates new channel if it doesn't exist
        - Only inserts new video entries (ignores duplicates)
        """
        # Read the required keys up front so that malformed input fails
        # before the session is touched.
        feed_title = feed_data["feed_title"]
        feed_link = feed_data["feed_link"]
        entries = feed_data["entries"]

        # One timestamp for the whole save keeps last_fetched and the
        # created_at of every new video consistent with each other.
        now = datetime.utcnow()

        try:
            channel = self._upsert_channel(db_session, feed_title, feed_link, now)
            self._add_new_videos(db_session, channel, entries, now)
            db_session.commit()
        except Exception:
            # Leave the session usable for the caller (e.g. when a concurrent
            # writer inserted the same row first), then propagate the error.
            db_session.rollback()
            raise
        return channel

    def _upsert_channel(
        self, db_session: Session, title: str, link: str, now: datetime
    ) -> Channel:
        """Update this parser's channel row, creating it if it is missing."""
        channel = db_session.query(Channel).filter_by(
            channel_id=self.channel_id
        ).first()

        if channel is None:
            channel = Channel(
                channel_id=self.channel_id,
                title=title,
                link=link,
                last_fetched=now,
            )
            db_session.add(channel)
            db_session.flush()  # Populate channel.id for the video rows
        else:
            channel.title = title
            channel.link = link
            channel.last_fetched = now
        return channel

    def _add_new_videos(
        self,
        db_session: Session,
        channel: Channel,
        entries: List[Dict],
        now: datetime,
    ) -> None:
        """Add a VideoEntry for every entry whose link is not stored yet."""
        # Collapse duplicate links inside the feed itself (first occurrence
        # wins) so the result does not depend on the session's autoflush
        # setting.
        entries_by_link: Dict[str, Dict] = {}
        for entry_data in entries:
            entries_by_link.setdefault(entry_data["link"], entry_data)

        if not entries_by_link:
            return

        # A single IN query replaces one SELECT per entry.
        stored_links = {
            row[0]
            for row in db_session.query(VideoEntry.link).filter(
                VideoEntry.link.in_(list(entries_by_link))
            )
        }

        for link, entry_data in entries_by_link.items():
            if link in stored_links:
                continue
            db_session.add(VideoEntry(
                channel_id=channel.id,
                title=entry_data["title"],
                link=link,
                created_at=now,
            ))