"""YouTube RSS feed parser module. This module handles fetching and parsing YouTube channel RSS feeds, with filtering capabilities to exclude unwanted content like Shorts. """ from datetime import datetime import feedparser from typing import Dict, List, Optional import re from sqlalchemy.orm import Session from sqlalchemy.exc import IntegrityError from models import Channel, VideoEntry class FeedEntry: """Represents a single entry in a YouTube RSS feed.""" def __init__(self, title: str, video_url: str, video_id: str, published_at: datetime, thumbnail_url: Optional[str] = None, description: Optional[str] = None): self.title = title self.video_url = video_url self.video_id = video_id self.published_at = published_at self.thumbnail_url = thumbnail_url self.description = description def to_dict(self) -> Dict: """Convert entry to dictionary.""" return { "title": self.title, "video_url": self.video_url, "video_id": self.video_id, "published_at": self.published_at.isoformat(), "thumbnail_url": self.thumbnail_url, "description": self.description } class YouTubeFeedParser: """Parser for YouTube channel RSS feeds.""" BASE_URL = "https://www.youtube.com/feeds/videos.xml" def __init__(self, channel_id: str): """Initialize parser with a YouTube channel ID. Args: channel_id: The YouTube channel ID to fetch feeds from """ self.channel_id = channel_id self.url = f"{self.BASE_URL}?channel_id={channel_id}" def fetch_feed(self, filter_shorts: bool = True) -> Optional[Dict]: """Fetch and parse the RSS feed. Args: filter_shorts: If True, exclude YouTube Shorts from results Returns: Dictionary containing feed metadata and entries, or None if fetch fails """ feed = feedparser.parse(self.url) if feed.status != 200: return None entries = [] for entry in feed.entries: if filter_shorts and "shorts" in entry.link.lower(): continue # Extract video ID from URL video_id = self._extract_video_id(entry.link) if not video_id: continue # Get thumbnail URL (YouTube provides this in media:group) thumbnail_url = None if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail: thumbnail_url = entry.media_thumbnail[0]['url'] # Get description description = None if hasattr(entry, 'summary'): description = entry.summary # Parse published date published_at = datetime(*entry.published_parsed[:6]) entries.append(FeedEntry( title=entry.title, video_url=entry.link, video_id=video_id, published_at=published_at, thumbnail_url=thumbnail_url, description=description )) return { "feed_title": feed.feed.title, "feed_link": feed.feed.link, "rss_url": self.url, "entries": [entry.to_dict() for entry in entries] } @staticmethod def _extract_video_id(url: str) -> Optional[str]: """Extract video ID from YouTube URL. Args: url: YouTube video URL Returns: Video ID or None if not found """ # Match patterns like: youtube.com/watch?v=VIDEO_ID match = re.search(r'[?&]v=([a-zA-Z0-9_-]{11})', url) if match: return match.group(1) # Match patterns like: youtu.be/VIDEO_ID match = re.search(r'youtu\.be/([a-zA-Z0-9_-]{11})', url) if match: return match.group(1) return None def save_to_db(self, db_session: Session, feed_data: Dict, user_id: int) -> Channel: """Save feed data to the database. Args: db_session: SQLAlchemy database session feed_data: Dictionary containing feed metadata and entries (from fetch_feed) user_id: ID of the user subscribing to this channel Returns: The Channel model instance This method uses upsert logic: - Updates existing channel if it exists for this user - Creates new channel if it doesn't exist - Only inserts new video entries (ignores duplicates based on video_id and channel_id) """ # Get or create channel for this user channel = db_session.query(Channel).filter_by( user_id=user_id, channel_id=self.channel_id ).first() if channel: # Update existing channel channel.title = feed_data["feed_title"] channel.link = feed_data["feed_link"] channel.rss_url = feed_data["rss_url"] channel.last_fetched_at = datetime.utcnow() else: # Create new channel channel = Channel( user_id=user_id, channel_id=self.channel_id, title=feed_data["feed_title"], link=feed_data["feed_link"], rss_url=feed_data["rss_url"], last_fetched_at=datetime.utcnow() ) db_session.add(channel) db_session.flush() # Get the channel ID # Add video entries (ignore duplicates) for entry_data in feed_data["entries"]: # Check if video already exists for this channel existing = db_session.query(VideoEntry).filter_by( channel_id=channel.id, video_id=entry_data["video_id"] ).first() if not existing: # Parse published_at if it's a string published_at = entry_data["published_at"] if isinstance(published_at, str): published_at = datetime.fromisoformat(published_at.replace('Z', '+00:00')) video = VideoEntry( channel_id=channel.id, video_id=entry_data["video_id"], title=entry_data["title"], video_url=entry_data["video_url"], thumbnail_url=entry_data.get("thumbnail_url"), description=entry_data.get("description"), published_at=published_at, created_at=datetime.utcnow() ) db_session.add(video) db_session.commit() return channel