feat(01-02): create email body parser for multipart MIME messages
- Implemented parse_email_body function for RFC822 email parsing
- Uses stdlib email.message_from_bytes with modern EmailMessage API
- Extracts text and HTML bodies using get_body() method
- Prefers plain text over HTML for "preferred" field
- Converts HTML to text using html2text when text body missing
- Extracts all metadata: subject, from, to, date, message_id
- Uses parsedate_to_datetime for proper date parsing
- Handles UnicodeDecodeError gracefully with partial data return
- Follows async patterns and logging conventions from existing codebase
This commit is contained in:
123
blueprints/email/parser_service.py
Normal file
123
blueprints/email/parser_service.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Email body parsing service for multipart MIME messages.
|
||||
|
||||
Extracts text and HTML bodies from RFC822 email format, converts HTML to text
|
||||
when needed, and extracts email metadata (subject, from, to, date, message-id).
|
||||
"""
|
||||
|
||||
import logging
|
||||
from email import message_from_bytes
|
||||
from email.policy import default
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
import html2text
|
||||
|
||||
# Configure logging
logger = logging.getLogger(__name__)


def _decode_body_part(part):
    """Return the decoded text of a text/* MIME part.

    ``get_content()`` decodes the payload using the charset declared on the
    part and raises ``LookupError`` when Python does not know that charset
    (bogus or vendor-specific labels). In that case the transfer-decoded
    payload bytes are decoded as UTF-8 with replacement characters, so a
    mislabelled message still yields a usable body instead of aborting.

    Args:
        part: The message part returned by ``EmailMessage.get_body()``.

    Returns:
        The decoded body string, or None if the part has no byte payload.
    """
    try:
        return part.get_content()
    except LookupError as charset_error:
        logger.warning(
            "[EMAIL PARSER] Unknown charset, falling back to UTF-8: %s",
            charset_error,
        )
        payload = part.get_payload(decode=True)
        if isinstance(payload, bytes):
            return payload.decode("utf-8", errors="replace")
        return None


def parse_email_body(raw_email_bytes: bytes) -> dict:
    """
    Extract text and HTML bodies from RFC822 email bytes.

    Args:
        raw_email_bytes: Raw email message bytes from IMAP FETCH

    Returns:
        Dictionary with keys:
            - "text": Plain text body (None if not present)
            - "html": HTML body (None if not present)
            - "preferred": Best available body (text preferred, HTML converted if text missing)
            - "subject": Email subject
            - "from": Sender address
            - "to": Recipient address(es)
            - "date": Parsed datetime object (None if missing/invalid)
            - "message_id": RFC822 Message-ID header
            - "error": Only present when a UnicodeDecodeError was caught; holds
              the error text. In that case "subject" is "[Encoding Error]" and
              all other fields are empty/None.

    Raises:
        Exception: Any error other than UnicodeDecodeError is logged with its
            traceback and re-raised unchanged.

    Note:
        Uses modern EmailMessage API with email.policy.default for proper
        encoding handling. Prefers plain text over HTML for RAG indexing.
    """
    logger.info("[EMAIL PARSER] Parsing email message")

    try:
        # Parse with modern EmailMessage API and default policy
        msg = message_from_bytes(raw_email_bytes, policy=default)

        result = {
            "text": None,
            "html": None,
            "preferred": None,
            "subject": "",
            "from": "",
            "to": "",
            "date": None,
            "message_id": "",
        }

        # Compare against None explicitly: a message object's truthiness is
        # its header count (Message.__len__), so a valid MIME part that has
        # no headers of its own would otherwise be skipped.
        text_part = msg.get_body(preferencelist=("plain",))
        if text_part is not None:
            result["text"] = _decode_body_part(text_part)
            logger.debug("[EMAIL PARSER] Found plain text body")

        html_part = msg.get_body(preferencelist=("html",))
        if html_part is not None:
            result["html"] = _decode_body_part(html_part)
            logger.debug("[EMAIL PARSER] Found HTML body")

        # Determine preferred version (text preferred for RAG). Truthiness is
        # intentional here: an empty text body falls through to the HTML one.
        if result["text"]:
            result["preferred"] = result["text"]
            logger.debug("[EMAIL PARSER] Using plain text as preferred")
        elif result["html"]:
            # Convert HTML to text using html2text
            converter = html2text.HTML2Text()
            converter.ignore_links = False  # Keep links for context
            result["preferred"] = converter.handle(result["html"])
            logger.debug("[EMAIL PARSER] Converted HTML to text for preferred")
        else:
            logger.warning(
                "[EMAIL PARSER] No body content found (neither text nor HTML)"
            )

        # Extract metadata
        result["subject"] = msg.get("subject", "")
        result["from"] = msg.get("from", "")
        result["to"] = msg.get("to", "")
        result["message_id"] = msg.get("message-id", "")

        # Date parsing is best-effort: a bad Date header must not discard the
        # rest of the message. The header lookup is inside the try as well,
        # because header access itself can raise for malformed dates on older
        # Python versions.
        date_header = None
        try:
            date_header = msg.get("date")
            if date_header:
                result["date"] = parsedate_to_datetime(date_header)
        except Exception as date_error:
            logger.warning(
                "[EMAIL PARSER] Failed to parse date header '%s': %s",
                date_header,
                date_error,
            )

        logger.info(
            "[EMAIL PARSER] Successfully parsed email: subject='%s', from='%s'",
            result["subject"],
            result["from"],
        )
        return result

    except UnicodeDecodeError as e:
        logger.error("[EMAIL PARSER] Unicode decode error: %s", e)
        # Return a placeholder record with an error indication
        return {
            "text": None,
            "html": None,
            "preferred": None,
            "subject": "[Encoding Error]",
            "from": "",
            "to": "",
            "date": None,
            "message_id": "",
            "error": str(e),
        }
    except Exception as e:
        # logger.exception records the message and the traceback in a single
        # ERROR entry; the error is then propagated to the caller unchanged.
        logger.exception(
            "[EMAIL PARSER] Unexpected error: %s: %s", type(e).__name__, e
        )
        raise
|
||||
Reference in New Issue
Block a user