2026-03-23 00:59:47 +03:00

62 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import asyncio
import logging
import re
from html import unescape
from typing import List, Optional
import feedparser
from src.domain.news.entities import NewsItem
from src.domain.news.ports import NewsFetcher
from src.domain.subscriptions.entities import Subscription
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def _strip_html(text: str) -> str:
"""Удаляет HTML-теги и декодирует HTML-сущности."""
text = re.sub(r'<[^>]+>', '', text)
return unescape(text).strip()
def _extract_image_url(entry) -> Optional[str]:
"""Возвращает URL первого enclosure с type image/*."""
for enc in entry.get("enclosures", []):
if enc.get("type", "").startswith("image/"):
return enc.get("href") or enc.get("url")
return None
class RssFetcher(NewsFetcher):
    """``NewsFetcher`` implementation that reads entries from an RSS feed."""

    async def fetch(self, subscription: Subscription) -> List[NewsItem]:
        """Poll the RSS feed at ``subscription.source``.

        ``feedparser.parse()`` is a blocking call, so it is run in the
        default executor to avoid blocking the event loop.

        Returns:
            One ``NewsItem`` per feed entry that has an ``id`` or a
            ``link``; entries with neither are skipped. An empty list is
            returned when parsing raises.
        """
        # Inside a coroutine get_running_loop() is the preferred way to
        # obtain the loop (get_event_loop() has policy-dependent behavior).
        loop = asyncio.get_running_loop()
        try:
            feed = await loop.run_in_executor(
                None, feedparser.parse, subscription.source
            )
        except Exception:
            logger.exception("Ошибка при чтении RSS %s", subscription.source)
            return []

        # feedparser reports many failures via the ``bozo`` flag instead of
        # raising; surface them when the feed produced nothing, so a broken
        # source does not fail silently.
        if not feed.entries and feed.get("bozo"):
            logger.warning(
                "RSS %s не удалось разобрать: %r",
                subscription.source,
                feed.get("bozo_exception"),
            )

        items: List[NewsItem] = []
        for entry in feed.entries:
            news_id = entry.get("id") or entry.get("link") or ""
            if not news_id:
                # Without an id there is nothing to key the item on;
                # skip before doing any per-entry work.
                continue

            raw_summary = entry.get("summary")
            items.append(NewsItem(
                id=news_id,
                # ``or`` also covers a title that is present but empty.
                title=entry.get("title") or "(без заголовка)",
                link=entry.get("link", ""),
                summary=_strip_html(raw_summary) if raw_summary else None,
                image_url=_extract_image_url(entry),
            ))
        return items