62 lines
2.1 KiB
Python
62 lines
2.1 KiB
Python
import asyncio
|
||
import logging
|
||
import re
|
||
from html import unescape
|
||
from typing import List, Optional
|
||
|
||
import feedparser
|
||
|
||
from src.domain.news.entities import NewsItem
|
||
from src.domain.news.ports import NewsFetcher
|
||
from src.domain.subscriptions.entities import Subscription
|
||
|
||
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _strip_html(text: str) -> str:
|
||
"""Удаляет HTML-теги и декодирует HTML-сущности."""
|
||
text = re.sub(r'<[^>]+>', '', text)
|
||
return unescape(text).strip()
|
||
|
||
|
||
def _extract_image_url(entry) -> Optional[str]:
|
||
"""Возвращает URL первого enclosure с type image/*."""
|
||
for enc in entry.get("enclosures", []):
|
||
if enc.get("type", "").startswith("image/"):
|
||
return enc.get("href") or enc.get("url")
|
||
return None
|
||
|
||
|
||
class RssFetcher(NewsFetcher):
    """``NewsFetcher`` implementation that reads news items from an RSS feed."""

    async def fetch(self, subscription: Subscription) -> List[NewsItem]:
        """Poll the RSS feed at ``subscription.source`` and return its entries.

        ``feedparser.parse()`` is a blocking call, so it is run in the default
        executor to keep the event loop responsive.

        Args:
            subscription: Subscription whose ``source`` is handed to
                ``feedparser.parse``.

        Returns:
            One ``NewsItem`` per feed entry that has an ``id`` or a ``link``.
            An empty list is returned when parsing raises.
        """
        # Inside a coroutine a loop is always running; get_running_loop() is
        # the call the asyncio docs recommend here over get_event_loop().
        loop = asyncio.get_running_loop()
        try:
            feed = await loop.run_in_executor(None, feedparser.parse, subscription.source)
        except Exception:
            logger.exception("Ошибка при чтении RSS %s", subscription.source)
            return []

        # feedparser signals most download/parse problems through the ``bozo``
        # flag instead of raising, so surface them rather than silently
        # returning an empty list.
        if feed.get("bozo") and not feed.entries:
            logger.warning(
                "RSS %s не удалось разобрать: %r",
                subscription.source,
                feed.get("bozo_exception"),
            )

        items: List[NewsItem] = []
        for entry in feed.entries:
            news_id = entry.get("id") or entry.get("link") or ""
            if not news_id:
                # Entries with neither an id nor a link are skipped.
                continue
            raw_summary = entry.get("summary")
            items.append(NewsItem(
                id=news_id,
                title=entry.get("title", "(без заголовка)"),
                link=entry.get("link", ""),
                summary=_strip_html(raw_summary) if raw_summary else None,
                image_url=_extract_image_url(entry),
            ))

        return items
|