# -*- coding: utf-8 -*-
# @Time : 2024/11/15 4:09 PM
# @Author : 河瞬
# @FileName: base_spider.py
# @Software: PyCharm
from typing import List, Dict


class BaseSpider:
|
||
|
|
name: str
|
||
|
|
index_url: List[str]
|
||
|
|
timeout: int
|
||
|
|
use_proxy_default: bool
|
||
|
|
|
||
|
|
def __init__(self, proxy_url: str = None, timeout: int = 6):
|
||
|
|
self.use_proxy = proxy_url is not None
|
||
|
|
self.proxy_url = proxy_url
|
||
|
|
self.timeout = timeout
|
||
|
|
|
||
|
|
def set_proxy(self, proxy_url: str):
|
||
|
|
self.use_proxy = True
|
||
|
|
self.proxy_url = proxy_url
|
||
|
|
|
||
|
|
async def get_news_urls(self, page_cnt: int) -> List[Dict | None]:
|
||
|
|
"""
|
||
|
|
|
||
|
|
:param page_cnt:
|
||
|
|
:return: List[Dict] 字典必须包含title和url两个key
|
||
|
|
"""
|
||
|
|
raise NotImplementedError("Subclass method not implemented:get_news_urls")
|
||
|
|
|
||
|
|
async def get_news_content(self, urls: List[str]) -> List[str | None]:
|
||
|
|
raise NotImplementedError("Subclass method not implemented:get_news_content")
|