diff --git a/db.sqlite3 b/db.sqlite3
index f30f98d..a4d078e 100644
Binary files a/db.sqlite3 and b/db.sqlite3 differ
diff --git a/main.py b/main.py
index b7ca48d..d9fb2f9 100644
--- a/main.py
+++ b/main.py
@@ -33,24 +33,20 @@ async def lifespan_test(app: FastAPI) -> AsyncGenerator[None, None]:
 
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
-    if getattr(app.state, "testing", None):
-        async with lifespan_test(app) as _:
-            yield
-    else:
-        config = generate_config(
-            "sqlite:///db.sqlite3",
-            app_modules={"models": ["models"]},
-            connection_label="models",
-        )
-        async with RegisterTortoise(
-            app=app,
-            config=config,
-        ):
-            # db connected
-            yield
-            # app teardown
-            # db connections closed
-            await Tortoise.close_connections()
+    config = generate_config(
+        "sqlite://db.sqlite3",
+        app_modules={"models": ["models"]},
+        connection_label="models",
+    )
+    async with RegisterTortoise(
+        app=app,
+        config=config
+    ):
+        # db connected
+        yield
+        # app teardown
+        # db connections closed
+        await Tortoise.close_connections()
 
 
 app = FastAPI(lifespan=lifespan)
diff --git a/web_spider/db.sqlite3 b/web_spider/db.sqlite3
index f30f98d..a4d078e 100644
Binary files a/web_spider/db.sqlite3 and b/web_spider/db.sqlite3 differ
diff --git a/web_spider/run_spiders.py b/web_spider/run_spiders.py
index df28f1f..0fe79a9 100644
--- a/web_spider/run_spiders.py
+++ b/web_spider/run_spiders.py
@@ -22,7 +22,8 @@ def initialize_spiders(package) -> List[BaseSpider]:
     for importer, modname, ispkg in pkgutil.iter_modules(package.__path__):
         module = importer.find_module(modname).load_module(modname)
         for name, obj in inspect.getmembers(module):
-            if inspect.isclass(obj) and issubclass(obj, BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
+            if inspect.isclass(obj) and issubclass(obj,
+                                                   BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
                 objs.append(obj())
     return objs
 
@@ -107,6 +108,8 @@ async def run_spiders(page_cnt: int = 2, use_proxy: bool = False, threads: int =
                 record = await RawData.get(url=all_urls[j])
                 record.content = content
                 await record.save(using_db=conn)
+        # record = await RawData.get(id=1)
+        # print(record.title, record.content)
     finally:
         await close_database()
 
diff --git a/web_spider/spiders/caltech.py b/web_spider/spiders/caltech.py
index e19e96a..8eaf959 100644
--- a/web_spider/spiders/caltech.py
+++ b/web_spider/spiders/caltech.py
@@ -52,18 +52,18 @@ class PsycologySpider(HttpxSpider):
 
         return news
 
-    def filter_content(self, content: str):
-        # content = content.replace("#", "")
-        content = content.replace("*", "")
-        content = content.replace("> \n", "")
-        content = content.replace(" \n", "\n")
-        content = content.replace("\n\n", "\n")
-        content = content.replace("\n\n", "\n")
-        pattern = r'!\[\]\(https?://[^\s)]+\)'
-        cleaned_text = re.sub(pattern, '', content)
-        pattern2 = r'#+ '
-        cleaned_text = re.sub(pattern2, '', cleaned_text)
-        return cleaned_text
+    # def filter_content(self, content: str):
+    #     # content = content.replace("#", "")
+    #     content = content.replace("*", "")
+    #     content = content.replace("> \n", "")
+    #     content = content.replace(" \n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     pattern = r'!\[\]\(https?://[^\s)]+\)'
+    #     cleaned_text = re.sub(pattern, '', content)
+    #     pattern2 = r'#+ '
+    #     cleaned_text = re.sub(pattern2, '', cleaned_text)
+    #     return cleaned_text
 
     async def get_news_content(self, urls: List[str]) -> List[str]:
         contents = []
@@ -97,10 +97,10 @@ class PsycologySpider(HttpxSpider):
                 exclude_external_links=True,
                 session_id=sessionid,
             )
-            t = self.filter_content(result.markdown)
+            # t = self.filter_content(result.markdown)
             # print(t)
-            contents.append(t)
-            await asyncio.sleep(2)
+            contents.append(result.markdown)
+            # await asyncio.sleep(2)
 
         return contents
 
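For reference, here is a minimal sketch of what main.py's lifespan looks like after this change. The import paths are my reading of the library, not shown in the diff: RegisterTortoise requires a recent tortoise-orm (the class was introduced around 0.21), and generate_config lives in tortoise.backends.base.config_generator. Note that Tortoise treats "sqlite://db.sqlite3" as a path relative to the working directory, whereas the old SQLAlchemy-style "sqlite:///db.sqlite3" would resolve to the absolute path /db.sqlite3, so the URL change is a real fix, not just cosmetics.

# Minimal sketch of the post-change lifespan (assumes tortoise-orm >= 0.21;
# import paths are assumptions, not part of the diff).
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from fastapi import FastAPI
from tortoise import Tortoise
from tortoise.backends.base.config_generator import generate_config
from tortoise.contrib.fastapi import RegisterTortoise


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    # "sqlite://db.sqlite3" is relative to the working directory; the old
    # "sqlite:///db.sqlite3" would have pointed at the absolute path /db.sqlite3.
    config = generate_config(
        "sqlite://db.sqlite3",
        app_modules={"models": ["models"]},
        connection_label="models",
    )
    async with RegisterTortoise(app=app, config=config):
        # db connected while the app serves requests
        yield
        # app teardown: close db connections before the context exits
        await Tortoise.close_connections()


app = FastAPI(lifespan=lifespan)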
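One thing the line re-wrap in run_spiders.py does not touch: importer.find_module(modname).load_module(modname) was deprecated in Python 3.4 and removed in 3.12, so initialize_spiders will break on current interpreters. A possible replacement using importlib is sketched below; it is not part of this diff, and the BaseSpider/HttpxSpider import path is hypothetical.

# Hypothetical modernization of initialize_spiders (not in this diff):
# importlib.import_module replaces the removed find_module/load_module pair.
import importlib
import inspect
import pkgutil
from typing import List

from spiders.base import BaseSpider, HttpxSpider  # assumed project import path


def initialize_spiders(package) -> List[BaseSpider]:
    objs: List[BaseSpider] = []
    for _importer, modname, _ispkg in pkgutil.iter_modules(package.__path__):
        # Import relative to the package so submodule names cannot collide
        # with top-level modules, unlike the old load_module(modname) call.
        module = importlib.import_module(f"{package.__name__}.{modname}")
        for _name, obj in inspect.getmembers(module):
            if (
                inspect.isclass(obj)
                and issubclass(obj, BaseSpider)
                and obj is not BaseSpider
                and obj is not HttpxSpider
            ):
                objs.append(obj())
    return objs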