最终push
This commit is contained in:
parent
69d751897d
commit
dd3502d99d
BIN
db.sqlite3
BIN
db.sqlite3
Binary file not shown.
32
main.py
32
main.py
@ -33,24 +33,20 @@ async def lifespan_test(app: FastAPI) -> AsyncGenerator[None, None]:
|
|||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||||
if getattr(app.state, "testing", None):
|
config = generate_config(
|
||||||
async with lifespan_test(app) as _:
|
"sqlite://db.sqlite3",
|
||||||
yield
|
app_modules={"models": ["models"]},
|
||||||
else:
|
connection_label="models",
|
||||||
config = generate_config(
|
)
|
||||||
"sqlite:///db.sqlite3",
|
async with RegisterTortoise(
|
||||||
app_modules={"models": ["models"]},
|
app=app,
|
||||||
connection_label="models",
|
config=config
|
||||||
)
|
):
|
||||||
async with RegisterTortoise(
|
# db connected
|
||||||
app=app,
|
yield
|
||||||
config=config,
|
# app teardown
|
||||||
):
|
# db connections closed
|
||||||
# db connected
|
await Tortoise.close_connections()
|
||||||
yield
|
|
||||||
# app teardown
|
|
||||||
# db connections closed
|
|
||||||
await Tortoise.close_connections()
|
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(lifespan=lifespan)
|
app = FastAPI(lifespan=lifespan)
|
||||||
|
|||||||
Binary file not shown.
@ -22,7 +22,8 @@ def initialize_spiders(package) -> List[BaseSpider]:
|
|||||||
for importer, modname, ispkg in pkgutil.iter_modules(package.__path__):
|
for importer, modname, ispkg in pkgutil.iter_modules(package.__path__):
|
||||||
module = importer.find_module(modname).load_module(modname)
|
module = importer.find_module(modname).load_module(modname)
|
||||||
for name, obj in inspect.getmembers(module):
|
for name, obj in inspect.getmembers(module):
|
||||||
if inspect.isclass(obj) and issubclass(obj, BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
|
if inspect.isclass(obj) and issubclass(obj,
|
||||||
|
BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
|
||||||
objs.append(obj())
|
objs.append(obj())
|
||||||
return objs
|
return objs
|
||||||
|
|
||||||
@ -107,6 +108,8 @@ async def run_spiders(page_cnt: int = 2, use_proxy: bool = False, threads: int =
|
|||||||
record = await RawData.get(url=all_urls[j])
|
record = await RawData.get(url=all_urls[j])
|
||||||
record.content = content
|
record.content = content
|
||||||
await record.save(using_db=conn)
|
await record.save(using_db=conn)
|
||||||
|
# record = await RawData.get(id=1)
|
||||||
|
# print(record.title, record.content)
|
||||||
finally:
|
finally:
|
||||||
await close_database()
|
await close_database()
|
||||||
|
|
||||||
|
|||||||
@ -52,18 +52,18 @@ class PsycologySpider(HttpxSpider):
|
|||||||
|
|
||||||
return news
|
return news
|
||||||
|
|
||||||
def filter_content(self, content: str):
|
# def filter_content(self, content: str):
|
||||||
# content = content.replace("#", "")
|
# # content = content.replace("#", "")
|
||||||
content = content.replace("*", "")
|
# content = content.replace("*", "")
|
||||||
content = content.replace("> \n", "")
|
# content = content.replace("> \n", "")
|
||||||
content = content.replace(" \n", "\n")
|
# content = content.replace(" \n", "\n")
|
||||||
content = content.replace("\n\n", "\n")
|
# content = content.replace("\n\n", "\n")
|
||||||
content = content.replace("\n\n", "\n")
|
# content = content.replace("\n\n", "\n")
|
||||||
pattern = r'!\[\]\(https?://[^\s)]+\)'
|
# pattern = r'!\[\]\(https?://[^\s)]+\)'
|
||||||
cleaned_text = re.sub(pattern, '', content)
|
# cleaned_text = re.sub(pattern, '', content)
|
||||||
pattern2 = r'#+ '
|
# pattern2 = r'#+ '
|
||||||
cleaned_text = re.sub(pattern2, '', cleaned_text)
|
# cleaned_text = re.sub(pattern2, '', cleaned_text)
|
||||||
return cleaned_text
|
# return cleaned_text
|
||||||
|
|
||||||
async def get_news_content(self, urls: List[str]) -> List[str]:
|
async def get_news_content(self, urls: List[str]) -> List[str]:
|
||||||
contents = []
|
contents = []
|
||||||
@ -97,10 +97,10 @@ class PsycologySpider(HttpxSpider):
|
|||||||
exclude_external_links=True,
|
exclude_external_links=True,
|
||||||
session_id=sessionid,
|
session_id=sessionid,
|
||||||
)
|
)
|
||||||
t = self.filter_content(result.markdown)
|
# t = self.filter_content(result.markdown)
|
||||||
# print(t)
|
# print(t)
|
||||||
contents.append(t)
|
contents.append(result.markdown)
|
||||||
await asyncio.sleep(2)
|
# await asyncio.sleep(2)
|
||||||
return contents
|
return contents
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user