PsycologyAPI/web_spider/spiders/caltech.py

# -*- coding: utf-8 -*-
import asyncio
import re
from typing import List, Dict

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

from web_spider import HttpxSpider


class PsycologySpider(HttpxSpider):
    """Spider for the xinli001.com 'emot' article channel."""

    name = "Psycology"
    use_proxy_default = False
    index_url = ["https://www.xinli001.com/info/emot?page=1"]
    headers = {
        "Host": "www.xinli001.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip",  # , deflate, br, zstd
        "Referer": "https://www.xinli001.com",
        "DNT": "1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Priority": "u=0, i",
    }

    async def get_news_urls(self, page_cnt: int = 8) -> List[Dict]:
        """Collect article titles and absolute URLs from the first `page_cnt` listing pages."""
        news: List[Dict] = []
        s = self.get_session()
        for i in range(page_cnt):
            response = await s.get(f"https://www.xinli001.com/info/emot?page={i + 1}")
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            article_list = soup.select_one("#articleListM")
            articles = article_list.find_all(class_='item')
            for article in articles:
                link_tag = article.find(class_="right").find('a', class_='title')
                title = link_tag.get_text().strip()
                # Hrefs can be absolute or site-relative; normalise them to absolute URLs.
                if link_tag['href'].startswith("http"):
                    url = link_tag['href']
                else:
                    url = "https://" + self.headers['Host'] + link_tag['href']
                result_dict = {'title': title, 'url': url}
                news.append(result_dict)
        return news
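
    # Illustrative shape of the list returned by get_news_urls (placeholder values,
    # not scraped data):
    #
    #     [{'title': '<article title>', 'url': 'https://www.xinli001.com/info/<id>'}, ...]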

    def filter_content(self, content: str):
        """Strip markdown noise (emphasis markers, blank quote lines, image links, heading marks)."""
        # content = content.replace("#", "")
        content = content.replace("*", "")
        content = content.replace("> \n", "")
        content = content.replace(" \n", "\n")
        content = content.replace("\n\n", "\n")
        content = content.replace("\n\n", "\n")
        # Drop inline markdown image links such as ![](https://...).
        pattern = r'!\[\]\(https?://[^\s)]+\)'
        cleaned_text = re.sub(pattern, '', content)
        # Drop markdown heading markers ("# ", "## ", ...).
        pattern2 = r'#+ '
        cleaned_text = re.sub(pattern2, '', cleaned_text)
        return cleaned_text
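
    # A rough before/after sketch of filter_content on crawl4ai markdown output
    # (illustrative input, assumed to be typical of what arun returns):
    #
    #     filter_content("## Title\n\n**body** ![](https://example.com/a.jpg)\n")
    #     # -> "Title\nbody \n"   (emphasis, image link and heading marker removed)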

    async def get_news_content(self, urls: List[str]) -> List[str]:
        contents = []
        # Earlier httpx-based fetch with simple retries, kept for reference:
        # s = self.get_session()
        # for url in urls:
        #     attempt = 0
        #     max_attempts = 3
        #     while attempt < max_attempts:
        #         try:
        #             response = await s.get(url)
        #             response.raise_for_status()
        #             contents.append(response.text)
        #             await asyncio.sleep(1)
        #             break  # success, stop retrying
        #         except Exception as e:
        #             attempt += 1
        #             if attempt == max_attempts:
        #                 print(f"Failed to fetch {url} after {max_attempts} attempts: {e}")
        #             await asyncio.sleep(1)  # wait one second before retrying
        # await s.aclose()
        sessionid = "asdf"
        async with AsyncWebCrawler(proxy=self.proxy_url if self.use_proxy else None) as crawler:
            for url in urls:
                result = await crawler.arun(
                    url=url,
                    page_timeout=self.timeout * 1000,
                    magic=True,
                    # bypass_cache=True,
                    wait_for="css:.yxl-editor-article ",
                    css_selector=".yxl-editor-article",
                    exclude_external_links=True,
                    session_id=sessionid,
                )
                t = self.filter_content(result.markdown)
                # print(t)
                contents.append(t)
                await asyncio.sleep(2)
        return contents
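
    # A possible retry wrapper around crawler.arun, mirroring the commented-out httpx retry
    # loop above (illustrative sketch; the attempt count and sleep interval are assumptions):
    #
    #     for attempt in range(3):
    #         try:
    #             result = await crawler.arun(url=url, session_id=sessionid)
    #             break
    #         except Exception as e:
    #             if attempt == 2:
    #                 print(f"Failed to fetch {url} after 3 attempts: {e}")
    #             await asyncio.sleep(1)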


async def main():
    s = PsycologySpider()
    # urls = await s.get_news_urls(2)
    # print(urls)
    # print(len(urls))
    contents = await s.get_news_content(["https://www.xinli001.com/info/100497752"])
    print(contents)


if __name__ == "__main__":
    asyncio.run(main())
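

# A possible end-to-end run combining both steps (illustrative sketch; the page count and
# the slice of five URLs are assumptions, not values from the original script):
#
#     async def run_pipeline():
#         spider = PsycologySpider()
#         items = await spider.get_news_urls(page_cnt=2)
#         texts = await spider.get_news_content([item['url'] for item in items[:5]])
#         for item, text in zip(items, texts):
#             print(item['title'], len(text))
#
#     asyncio.run(run_pipeline())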