# -*- coding: utf-8 -*-

import asyncio
import re
from typing import Dict, List

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

from web_spider import HttpxSpider
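
# NOTE: HttpxSpider is a project-local base class (web_spider.py) that is not
# shown in this file. From how it is used below, it is assumed to provide at
# least:
#   * get_session() returning an httpx.AsyncClient-like object with async .get()
#   * self.use_proxy, self.proxy_url, and self.timeout (in seconds)
#   * handling of the class attributes name, use_proxy_default, index_url,
#     and headers
# This is an inference from usage, not a documented interface.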


class PsycologySpider(HttpxSpider):
    name = "Psycology"
    use_proxy_default = False
    index_url = ["https://www.xinli001.com/info/emot?page=1"]
    headers = {
        "Host": "www.xinli001.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip",  # , deflate, br, zstd",
        "Referer": "https://www.xinli001.com",
        "DNT": "1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Priority": "u=0, i",
    }
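
    # The headers above replicate a desktop Firefox 132 request. Accept-Encoding
    # is limited to gzip (deflate/br/zstd are commented out in the original),
    # presumably so the HTTP client never has to decode brotli/zstd bodies;
    # that rationale is an assumption, not something documented in the source.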

    async def get_news_urls(self, page_cnt: int = 8) -> List[Dict[str, str]]:
        """Collect article titles and absolute URLs from the first page_cnt list pages."""
        news: List[Dict[str, str]] = []
        s = self.get_session()
        for i in range(page_cnt):
            response = await s.get(f"https://www.xinli001.com/info/emot?page={i + 1}")
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            article_list = soup.select_one("#articleListM")
            articles = article_list.find_all(class_='item')

            for article in articles:
                link_tag = article.find(class_="right").find('a', class_='title')
                title = link_tag.get_text().strip()
                # Hrefs on the list page may be relative; build an absolute URL.
                if link_tag['href'].startswith("http"):
                    url = link_tag['href']
                else:
                    url = "https://" + self.headers['Host'] + link_tag['href']

                news.append({'title': title, 'url': url})

        return news
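
    # The list returned by get_news_urls() has this shape (illustrative values,
    # not real data):
    #   [{'title': '<article title>', 'url': 'https://www.xinli001.com/info/<id>'}, ...]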

    # def filter_content(self, content: str):
    #     # content = content.replace("#", "")
    #     content = content.replace("*", "")
    #     content = content.replace("> \n", "")
    #     content = content.replace(" \n", "\n")
    #     content = content.replace("\n\n", "\n")
    #     content = content.replace("\n\n", "\n")
    #     pattern = r'!\[\]\(https?://[^\s)]+\)'
    #     cleaned_text = re.sub(pattern, '', content)
    #     pattern2 = r'#+ '
    #     cleaned_text = re.sub(pattern2, '', cleaned_text)
    #     return cleaned_text
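
    # For reference, the two regexes in the disabled filter_content() would strip
    # empty-alt markdown image links such as "![](https://example.com/pic.jpg)"
    # and heading markers such as "## "; the example URL here is illustrative only.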

    async def get_news_content(self, urls: List[str]) -> List[str]:
        contents = []
        # A previous session-based fetch with simple retries, kept commented out:
        # s = self.get_session()
        # for url in urls:
        #     attempt = 0
        #     max_attempts = 3
        #     while attempt < max_attempts:
        #         try:
        #             response = await s.get(url)
        #             response.raise_for_status()
        #             contents.append(response.text)
        #             await asyncio.sleep(1)
        #             break  # break out of the retry loop on success
        #         except Exception as e:
        #             attempt += 1
        #             if attempt == max_attempts:
        #                 print(f"Failed to fetch {url} after {max_attempts} attempts: {e}")
        #             await asyncio.sleep(1)  # wait 1 second before retrying
        # await s.aclose()
        session_id = "asdf"
        async with AsyncWebCrawler(proxy=self.proxy_url if self.use_proxy else None) as crawler:
            for url in urls:
                result = await crawler.arun(
                    url=url,
                    page_timeout=self.timeout * 1000,
                    magic=True,
                    # bypass_cache=True,
                    wait_for="css:.yxl-editor-article",
                    css_selector=".yxl-editor-article",
                    exclude_external_links=True,
                    session_id=session_id,
                )
                # t = self.filter_content(result.markdown)
                # print(t)
                contents.append(result.markdown)
                # await asyncio.sleep(2)
        return contents
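

# Illustrative helper, not part of the original module: it wires the two steps
# together, assuming every entry returned by get_news_urls() carries a 'url' key.
async def crawl_all(page_cnt: int = 1) -> List[Dict[str, str]]:
    spider = PsycologySpider()
    entries = await spider.get_news_urls(page_cnt)
    bodies = await spider.get_news_content([entry['url'] for entry in entries])
    # Attach each crawled markdown body to its title/url entry.
    return [{**entry, 'content': body} for entry, body in zip(entries, bodies)]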


async def main():
    s = PsycologySpider()
    # urls = await s.get_news_urls(2)
    # print(urls)
    # print(len(urls))
    contents = await s.get_news_content(["https://www.xinli001.com/info/100497752"])
    print(contents)


if __name__ == "__main__":
    asyncio.run(main())