PsycologyAPI/web_spider/spiders/caltech.py

# -*- coding: utf-8 -*-
import asyncio
import re
from typing import List, Dict

from bs4 import BeautifulSoup
from crawl4ai import AsyncWebCrawler

from web_spider import HttpxSpider


class PsycologySpider(HttpxSpider):
    """Spider for the xinli001.com 'emot' article channel."""

    name = "Psycology"
    use_proxy_default = False
    index_url = ["https://www.xinli001.com/info/emot?page=1"]
    headers = {
        "Host": "www.xinli001.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip",  # , deflate, br, zstd
        "Referer": "https://www.xinli001.com",
        "DNT": "1",
        "Sec-GPC": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
        "Priority": "u=0, i",
    }

    async def get_news_urls(self, page_cnt: int = 8) -> List[Dict]:
        """Collect article titles and absolute URLs from the first `page_cnt` listing pages."""
        news: List[Dict] = []
        s = self.get_session()
        for i in range(page_cnt):
            response = await s.get(f"https://www.xinli001.com/info/emot?page={i + 1}")
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            article_list = soup.select_one("#articleListM")
            articles = article_list.find_all(class_='item')
            for article in articles:
                link_tag = article.find(class_="right").find('a', class_='title')
                title = link_tag.get_text().strip()
                # Hrefs can be absolute or site-relative; normalise them to absolute URLs.
                if link_tag['href'].startswith("http"):
                    url = link_tag['href']
                else:
                    url = "https://" + self.headers['Host'] + link_tag['href']
                result_dict = {'title': title, 'url': url}
                news.append(result_dict)
        return news
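
    # Illustrative shape of the list returned by get_news_urls (placeholder values,
    # not scraped data):
    #
    #     [{'title': '<article title>', 'url': 'https://www.xinli001.com/info/<id>'}, ...]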

    def filter_content(self, content: str):
        """Strip markdown noise (emphasis markers, blank quote lines, image links, heading marks)."""
        # content = content.replace("#", "")
        content = content.replace("*", "")
        content = content.replace("> \n", "")
        content = content.replace(" \n", "\n")
        content = content.replace("\n\n", "\n")
        content = content.replace("\n\n", "\n")
        # Drop inline markdown image links such as ![](https://...).
        pattern = r'!\[\]\(https?://[^\s)]+\)'
        cleaned_text = re.sub(pattern, '', content)
        # Drop markdown heading markers ("# ", "## ", ...).
        pattern2 = r'#+ '
        cleaned_text = re.sub(pattern2, '', cleaned_text)
        return cleaned_text
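
    # A rough before/after sketch of filter_content on crawl4ai markdown output
    # (illustrative input, assumed to be typical of what arun returns):
    #
    #     filter_content("## Title\n\n**body** ![](https://example.com/a.jpg)\n")
    #     # -> "Title\nbody \n"   (emphasis, image link and heading marker removed)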

    async def get_news_content(self, urls: List[str]) -> List[str]:
        contents = []
        # Earlier httpx-based fetch with simple retries, kept for reference:
        # s = self.get_session()
        # for url in urls:
        #     attempt = 0
        #     max_attempts = 3
        #     while attempt < max_attempts:
        #         try:
        #             response = await s.get(url)
        #             response.raise_for_status()
        #             contents.append(response.text)
        #             await asyncio.sleep(1)
        #             break  # success, stop retrying
        #         except Exception as e:
        #             attempt += 1
        #             if attempt == max_attempts:
        #                 print(f"Failed to fetch {url} after {max_attempts} attempts: {e}")
        #             await asyncio.sleep(1)  # wait one second before retrying
        # await s.aclose()
        sessionid = "asdf"
        async with AsyncWebCrawler(proxy=self.proxy_url if self.use_proxy else None) as crawler:
            for url in urls:
                result = await crawler.arun(
                    url=url,
                    page_timeout=self.timeout * 1000,
                    magic=True,
                    # bypass_cache=True,
                    wait_for="css:.yxl-editor-article ",
                    css_selector=".yxl-editor-article",
                    exclude_external_links=True,
                    session_id=sessionid,
                )
                t = self.filter_content(result.markdown)
                # print(t)
                contents.append(t)
                await asyncio.sleep(2)
        return contents
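
    # A possible retry wrapper around crawler.arun, mirroring the commented-out httpx retry
    # loop above (illustrative sketch; the attempt count and sleep interval are assumptions):
    #
    #     for attempt in range(3):
    #         try:
    #             result = await crawler.arun(url=url, session_id=sessionid)
    #             break
    #         except Exception as e:
    #             if attempt == 2:
    #                 print(f"Failed to fetch {url} after 3 attempts: {e}")
    #             await asyncio.sleep(1)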


async def main():
    s = PsycologySpider()
    # urls = await s.get_news_urls(2)
    # print(urls)
    # print(len(urls))
    contents = await s.get_news_content(["https://www.xinli001.com/info/100497752"])
    print(contents)


if __name__ == "__main__":
    asyncio.run(main())
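

# A possible end-to-end run combining both steps (illustrative sketch; the page count and
# the slice of five URLs are assumptions, not values from the original script):
#
#     async def run_pipeline():
#         spider = PsycologySpider()
#         items = await spider.get_news_urls(page_cnt=2)
#         texts = await spider.get_news_content([item['url'] for item in items[:5]])
#         for item, text in zip(items, texts):
#             print(item['title'], len(text))
#
#     asyncio.run(run_pipeline())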