# -*- coding: utf-8 -*-
import asyncio
import re
from typing import Dict, List

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

from web_spider import HttpxSpider


class PsycologySpider(HttpxSpider):
    name = "Psycology"
    use_proxy_default = False
    index_url = ["https://www.xinli001.com/info/emot?page=1"]
    headers = {
        "Host": "www.xinli001.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip",  # deflate, br, zstd left out
        "Referer": "https://www.xinli001.com",
        "DNT": "1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Priority": "u=0, i",
    }

    async def get_news_urls(self, page_cnt: int = 8) -> List[Dict]:
        """Collect article titles and URLs from the first `page_cnt` listing pages."""
        news: List[Dict] = []
        s = self.get_session()
        for i in range(page_cnt):
            response = await s.get(f"https://www.xinli001.com/info/emot?page={i + 1}")
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            article_list = soup.select_one("#articleListM")
            if article_list is None:
                # Listing container missing (layout change or empty page); skip this page.
                continue
            for article in article_list.find_all(class_="item"):
                link_tag = article.find(class_="right").find("a", class_="title")
                title = link_tag.get_text().strip()
                # Normalize relative links to absolute URLs.
                if link_tag["href"].startswith("http"):
                    url = link_tag["href"]
                else:
                    url = "https://" + self.headers["Host"] + link_tag["href"]
                news.append({"title": title, "url": url})
        return news

    def filter_content(self, content: str) -> str:
        """Strip Markdown noise (emphasis, blockquotes, images, headings) from crawled text."""
        content = content.replace("*", "")
        content = content.replace("> \n", "")
        content = content.replace(" \n", "\n")
        content = content.replace("\n\n", "\n")
        content = content.replace("\n\n", "\n")
        # Remove inline Markdown images: ![](http...)
        cleaned_text = re.sub(r"!\[\]\(https?://[^\s)]+\)", "", content)
        # Remove heading markers (#, ##, ...)
        cleaned_text = re.sub(r"#+ ", "", cleaned_text)
        return cleaned_text

    async def get_news_content(self, urls: List[str]) -> List[str]:
        """Render each article page with crawl4ai and return the cleaned Markdown bodies."""
        contents: List[str] = []
        # Earlier httpx-based fetch with retries, kept for reference:
        # s = self.get_session()
        # for url in urls:
        #     attempt = 0
        #     max_attempts = 3
        #     while attempt < max_attempts:
        #         try:
        #             response = await s.get(url)
        #             response.raise_for_status()
        #             contents.append(response.text)
        #             await asyncio.sleep(1)
        #             break  # stop retrying once the fetch succeeds
        #         except Exception as e:
        #             attempt += 1
        #             if attempt == max_attempts:
        #                 print(f"Failed to fetch {url} after {max_attempts} attempts: {e}")
        #             await asyncio.sleep(1)  # wait 1 second before retrying
        # await s.aclose()
        session_id = "asdf"  # reuse one browser session across all article pages
        async with AsyncWebCrawler(proxy=self.proxy_url if self.use_proxy else None) as crawler:
            for url in urls:
                result = await crawler.arun(
                    url=url,
                    page_timeout=self.timeout * 1000,
                    magic=True,
                    # bypass_cache=True,
                    wait_for="css:.yxl-editor-article",
                    css_selector=".yxl-editor-article",
                    exclude_external_links=True,
                    session_id=session_id,
                )
                contents.append(self.filter_content(result.markdown))
                await asyncio.sleep(2)  # be polite between article requests
        return contents


async def main():
    s = PsycologySpider()
    # urls = await s.get_news_urls(2)
    # print(urls)
    # print(len(urls))
    contents = await s.get_news_content(["https://www.xinli001.com/info/100497752"])
    print(contents)


if __name__ == "__main__":
    asyncio.run(main())