# -*- coding: utf-8 -*-

import asyncio
import re
from typing import Dict, List

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

from web_spider import HttpxSpider
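
# NOTE: HttpxSpider is a project-local base class (web_spider.py) that is not
# shown in this file. From how it is used below, it is assumed to provide at
# least:
#   * get_session() returning an httpx.AsyncClient-like object with async .get()
#   * self.use_proxy, self.proxy_url, and self.timeout (in seconds)
#   * handling of the class attributes name, use_proxy_default, index_url,
#     and headers
# This is an inference from usage, not a documented interface.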


class PsycologySpider(HttpxSpider):
    name = "Psycology"
    use_proxy_default = False
    index_url = ["https://www.xinli001.com/info/emot?page=1"]
    headers = {
        "Host": "www.xinli001.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip",  # , deflate, br, zstd",
        "Referer": "https://www.xinli001.com",
        "DNT": "1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Priority": "u=0, i",
    }
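
    # The headers above replicate a desktop Firefox 132 request. Accept-Encoding
    # is limited to gzip (deflate/br/zstd are commented out in the original),
    # presumably so the HTTP client never has to decode brotli/zstd bodies;
    # that rationale is an assumption, not something documented in the source.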

    async def get_news_urls(self, page_cnt: int = 8) -> List[Dict[str, str]]:
        """Collect article titles and absolute URLs from the first page_cnt list pages."""
        news: List[Dict[str, str]] = []
        s = self.get_session()
        for i in range(page_cnt):
            response = await s.get(f"https://www.xinli001.com/info/emot?page={i + 1}")
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            article_list = soup.select_one("#articleListM")
            articles = article_list.find_all(class_='item')

            for article in articles:
                link_tag = article.find(class_="right").find('a', class_='title')
                title = link_tag.get_text().strip()
                # Hrefs on the list page may be relative; build an absolute URL.
                if link_tag['href'].startswith("http"):
                    url = link_tag['href']
                else:
                    url = "https://" + self.headers['Host'] + link_tag['href']

                news.append({'title': title, 'url': url})

        return news
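
    # The list returned by get_news_urls() has this shape (illustrative values,
    # not real data):
    #   [{'title': '<article title>', 'url': 'https://www.xinli001.com/info/<id>'}, ...]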

    # def filter_content(self, content: str):
    #     # content = content.replace("#", "")
    #     content = content.replace("*", "")
    #     content = content.replace("> \n", "")
    #     content = content.replace(" \n", "\n")
    #     content = content.replace("\n\n", "\n")
    #     content = content.replace("\n\n", "\n")
    #     pattern = r'!\[\]\(https?://[^\s)]+\)'
    #     cleaned_text = re.sub(pattern, '', content)
    #     pattern2 = r'#+ '
    #     cleaned_text = re.sub(pattern2, '', cleaned_text)
    #     return cleaned_text
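
    # For reference, the two regexes in the disabled filter_content() would strip
    # empty-alt markdown image links such as "![](https://example.com/pic.jpg)"
    # and heading markers such as "## "; the example URL here is illustrative only.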

    async def get_news_content(self, urls: List[str]) -> List[str]:
        contents = []
        # A previous session-based fetch with simple retries, kept commented out:
        # s = self.get_session()
        # for url in urls:
        #     attempt = 0
        #     max_attempts = 3
        #     while attempt < max_attempts:
        #         try:
        #             response = await s.get(url)
        #             response.raise_for_status()
        #             contents.append(response.text)
        #             await asyncio.sleep(1)
        #             break  # break out of the retry loop on success
        #         except Exception as e:
        #             attempt += 1
        #             if attempt == max_attempts:
        #                 print(f"Failed to fetch {url} after {max_attempts} attempts: {e}")
        #             await asyncio.sleep(1)  # wait 1 second before retrying
        # await s.aclose()
        session_id = "asdf"
        async with AsyncWebCrawler(proxy=self.proxy_url if self.use_proxy else None) as crawler:
            for url in urls:
                result = await crawler.arun(
                    url=url,
                    page_timeout=self.timeout * 1000,
                    magic=True,
                    # bypass_cache=True,
                    wait_for="css:.yxl-editor-article",
                    css_selector=".yxl-editor-article",
                    exclude_external_links=True,
                    session_id=session_id,
                )
                # t = self.filter_content(result.markdown)
                # print(t)
                contents.append(result.markdown)
                # await asyncio.sleep(2)
        return contents
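

# Illustrative helper, not part of the original module: it wires the two steps
# together, assuming every entry returned by get_news_urls() carries a 'url' key.
async def crawl_all(page_cnt: int = 1) -> List[Dict[str, str]]:
    spider = PsycologySpider()
    entries = await spider.get_news_urls(page_cnt)
    bodies = await spider.get_news_content([entry['url'] for entry in entries])
    # Attach each crawled markdown body to its title/url entry.
    return [{**entry, 'content': body} for entry, body in zip(entries, bodies)]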


async def main():
    s = PsycologySpider()
    # urls = await s.get_news_urls(2)
    # print(urls)
    # print(len(urls))
    contents = await s.get_news_content(["https://www.xinli001.com/info/100497752"])
    print(contents)


if __name__ == "__main__":
    asyncio.run(main())