Final push

高子兴 2024-12-28 21:01:29 +08:00
parent 69d751897d
commit dd3502d99d
5 changed files with 33 additions and 34 deletions

Binary file not shown.

main.py

@@ -33,24 +33,20 @@ async def lifespan_test(app: FastAPI) -> AsyncGenerator[None, None]:
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
-    if getattr(app.state, "testing", None):
-        async with lifespan_test(app) as _:
-            yield
-    else:
-        config = generate_config(
-            "sqlite:///db.sqlite3",
-            app_modules={"models": ["models"]},
-            connection_label="models",
-        )
-        async with RegisterTortoise(
-            app=app,
-            config=config,
-        ):
-            # db connected
-            yield
-            # app teardown
-        # db connections closed
-        await Tortoise.close_connections()
+    config = generate_config(
+        "sqlite://db.sqlite3",
+        app_modules={"models": ["models"]},
+        connection_label="models",
+    )
+    async with RegisterTortoise(
+        app=app,
+        config=config
+    ):
+        # db connected
+        yield
+        # app teardown
+    # db connections closed
+    await Tortoise.close_connections()

 app = FastAPI(lifespan=lifespan)
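For orientation, the new lifespan assembled as a standalone module might look like the sketch below. The diff does not show main.py's import block, so the import paths (generate_config from tortoise.backends.base.config_generator, RegisterTortoise from tortoise.contrib.fastapi) are assumptions; the function body mirrors the added lines.

from contextlib import asynccontextmanager
from typing import AsyncGenerator

from fastapi import FastAPI
from tortoise import Tortoise
from tortoise.backends.base.config_generator import generate_config
from tortoise.contrib.fastapi import RegisterTortoise


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    # Build the Tortoise config for the local SQLite file and the "models"
    # module, as in the added hunk lines.
    config = generate_config(
        "sqlite://db.sqlite3",
        app_modules={"models": ["models"]},
        connection_label="models",
    )
    async with RegisterTortoise(app=app, config=config):
        # db connected for the whole application lifetime
        yield
        # app teardown
    # db connections closed
    await Tortoise.close_connections()


app = FastAPI(lifespan=lifespan)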

Binary file not shown.


@@ -22,7 +22,8 @@ def initialize_spiders(package) -> List[BaseSpider]:
     for importer, modname, ispkg in pkgutil.iter_modules(package.__path__):
         module = importer.find_module(modname).load_module(modname)
         for name, obj in inspect.getmembers(module):
-            if inspect.isclass(obj) and issubclass(obj, BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
+            if inspect.isclass(obj) and issubclass(obj,
+                                                   BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
                 objs.append(obj())
     return objs
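A side note on the unchanged context lines above: importer.find_module(modname).load_module(modname) is deprecated and removed in Python 3.12. Below is a hedged sketch of the same spider-discovery loop using importlib; BaseSpider, HttpxSpider, and the package argument come from the diff, while their import path is an assumption.

import importlib
import inspect
import pkgutil
from typing import List

from spiders.base import BaseSpider, HttpxSpider  # assumed module path


def initialize_spiders(package) -> List[BaseSpider]:
    objs: List[BaseSpider] = []
    for _importer, modname, _ispkg in pkgutil.iter_modules(package.__path__):
        # import_module replaces the deprecated find_module/load_module pair
        module = importlib.import_module(f"{package.__name__}.{modname}")
        for _name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BaseSpider) and obj not in (BaseSpider, HttpxSpider):
                objs.append(obj())
    return objs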
@@ -107,6 +108,8 @@ async def run_spiders(page_cnt: int = 2, use_proxy: bool = False, threads: int =
                 record = await RawData.get(url=all_urls[j])
                 record.content = content
                 await record.save(using_db=conn)
+                # record = await RawData.get(id=1)
+                # print(record.title, record.content)
     finally:
         await close_database()
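The hunk above updates RawData rows through Tortoise. The model itself is not part of this commit, so the sketch below is a hypothetical reconstruction of its shape, inferred only from the fields the code touches (url, title, content).

from tortoise import fields
from tortoise.models import Model


class RawData(Model):
    # Hypothetical field definitions; only the names are taken from the diff.
    id = fields.IntField(pk=True)
    url = fields.CharField(max_length=2048, unique=True)  # looked up via RawData.get(url=...)
    title = fields.TextField(null=True)                    # printed in the commented-out check
    content = fields.TextField(null=True)                  # overwritten and saved with using_db=conn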


@@ -52,18 +52,18 @@ class PsycologySpider(HttpxSpider):
         return news

-    def filter_content(self, content: str):
-        # content = content.replace("#", "")
-        content = content.replace("*", "")
-        content = content.replace("> \n", "")
-        content = content.replace(" \n", "\n")
-        content = content.replace("\n\n", "\n")
-        content = content.replace("\n\n", "\n")
-        pattern = r'!\[\]\(https?://[^\s)]+\)'
-        cleaned_text = re.sub(pattern, '', content)
-        pattern2 = r'#+ '
-        cleaned_text = re.sub(pattern2, '', cleaned_text)
-        return cleaned_text
+    # def filter_content(self, content: str):
+    #     # content = content.replace("#", "")
+    #     content = content.replace("*", "")
+    #     content = content.replace("> \n", "")
+    #     content = content.replace(" \n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     pattern = r'!\[\]\(https?://[^\s)]+\)'
+    #     cleaned_text = re.sub(pattern, '', content)
+    #     pattern2 = r'#+ '
+    #     cleaned_text = re.sub(pattern2, '', cleaned_text)
+    #     return cleaned_text

     async def get_news_content(self, urls: List[str]) -> List[str]:
         contents = []
@@ -97,10 +97,10 @@ class PsycologySpider(HttpxSpider):
                 exclude_external_links=True,
                 session_id=sessionid,
             )
-            t = self.filter_content(result.markdown)
+            # t = self.filter_content(result.markdown)
             # print(t)
-            contents.append(t)
-            await asyncio.sleep(2)
+            contents.append(result.markdown)
+            # await asyncio.sleep(2)
         return contents
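For reference, the markdown cleanup that this commit disables, reassembled from the removed lines into a standalone function (only the re import and a return annotation are added): it strips emphasis markers and extra blank lines, then removes inline image links and heading prefixes.

import re


def filter_content(content: str) -> str:
    # Behaviour copied from the removed method body above.
    content = content.replace("*", "")
    content = content.replace("> \n", "")
    content = content.replace(" \n", "\n")
    content = content.replace("\n\n", "\n")
    content = content.replace("\n\n", "\n")
    # Drop markdown image links like ![](https://...)
    pattern = r'!\[\]\(https?://[^\s)]+\)'
    cleaned_text = re.sub(pattern, '', content)
    # Drop heading markers ("# ", "## ", ...)
    pattern2 = r'#+ '
    cleaned_text = re.sub(pattern2, '', cleaned_text)
    return cleaned_text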