Final push

高子兴 2024-12-28 21:01:29 +08:00
parent 69d751897d
commit dd3502d99d
5 changed files with 33 additions and 34 deletions

Binary file not shown.

main.py

@@ -33,24 +33,20 @@ async def lifespan_test(app: FastAPI) -> AsyncGenerator[None, None]:
 @asynccontextmanager
 async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
-    if getattr(app.state, "testing", None):
-        async with lifespan_test(app) as _:
-            yield
-    else:
-        config = generate_config(
-            "sqlite:///db.sqlite3",
-            app_modules={"models": ["models"]},
-            connection_label="models",
-        )
-        async with RegisterTortoise(
-            app=app,
-            config=config,
-        ):
-            # db connected
-            yield
-            # app teardown
-        # db connections closed
-        await Tortoise.close_connections()
+    config = generate_config(
+        "sqlite://db.sqlite3",
+        app_modules={"models": ["models"]},
+        connection_label="models",
+    )
+    async with RegisterTortoise(
+        app=app,
+        config=config
+    ):
+        # db connected
+        yield
+        # app teardown
+    # db connections closed
+    await Tortoise.close_connections()

 app = FastAPI(lifespan=lifespan)
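For orientation, the new lifespan assembled as a standalone module might look like the sketch below. The diff does not show main.py's import block, so the import paths (generate_config from tortoise.backends.base.config_generator, RegisterTortoise from tortoise.contrib.fastapi) are assumptions; the function body mirrors the added lines.

from contextlib import asynccontextmanager
from typing import AsyncGenerator

from fastapi import FastAPI
from tortoise import Tortoise
from tortoise.backends.base.config_generator import generate_config
from tortoise.contrib.fastapi import RegisterTortoise


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    # Build the Tortoise config for the local SQLite file and the "models"
    # module, as in the added hunk lines.
    config = generate_config(
        "sqlite://db.sqlite3",
        app_modules={"models": ["models"]},
        connection_label="models",
    )
    async with RegisterTortoise(app=app, config=config):
        # db connected for the whole application lifetime
        yield
        # app teardown
    # db connections closed
    await Tortoise.close_connections()


app = FastAPI(lifespan=lifespan)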

Binary file not shown.


@@ -22,7 +22,8 @@ def initialize_spiders(package) -> List[BaseSpider]:
     for importer, modname, ispkg in pkgutil.iter_modules(package.__path__):
         module = importer.find_module(modname).load_module(modname)
         for name, obj in inspect.getmembers(module):
-            if inspect.isclass(obj) and issubclass(obj, BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
+            if inspect.isclass(obj) and issubclass(obj,
+                                                   BaseSpider) and obj is not BaseSpider and obj is not HttpxSpider:
                 objs.append(obj())
     return objs
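A side note on the unchanged context lines above: importer.find_module(modname).load_module(modname) is deprecated and removed in Python 3.12. Below is a hedged sketch of the same spider-discovery loop using importlib; BaseSpider, HttpxSpider, and the package argument come from the diff, while their import path is an assumption.

import importlib
import inspect
import pkgutil
from typing import List

from spiders.base import BaseSpider, HttpxSpider  # assumed module path


def initialize_spiders(package) -> List[BaseSpider]:
    objs: List[BaseSpider] = []
    for _importer, modname, _ispkg in pkgutil.iter_modules(package.__path__):
        # import_module replaces the deprecated find_module/load_module pair
        module = importlib.import_module(f"{package.__name__}.{modname}")
        for _name, obj in inspect.getmembers(module, inspect.isclass):
            if issubclass(obj, BaseSpider) and obj not in (BaseSpider, HttpxSpider):
                objs.append(obj())
    return objs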
@@ -107,6 +108,8 @@ async def run_spiders(page_cnt: int = 2, use_proxy: bool = False, threads: int =
                 record = await RawData.get(url=all_urls[j])
                 record.content = content
                 await record.save(using_db=conn)
+                # record = await RawData.get(id=1)
+                # print(record.title, record.content)
     finally:
         await close_database()
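The hunk above updates RawData rows through Tortoise. The model itself is not part of this commit, so the sketch below is a hypothetical reconstruction of its shape, inferred only from the fields the code touches (url, title, content).

from tortoise import fields
from tortoise.models import Model


class RawData(Model):
    # Hypothetical field definitions; only the names are taken from the diff.
    id = fields.IntField(pk=True)
    url = fields.CharField(max_length=2048, unique=True)  # looked up via RawData.get(url=...)
    title = fields.TextField(null=True)                    # printed in the commented-out check
    content = fields.TextField(null=True)                  # overwritten and saved with using_db=conn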


@@ -52,18 +52,18 @@ class PsycologySpider(HttpxSpider):
         return news

-    def filter_content(self, content: str):
-        # content = content.replace("#", "")
-        content = content.replace("*", "")
-        content = content.replace("> \n", "")
-        content = content.replace(" \n", "\n")
-        content = content.replace("\n\n", "\n")
-        content = content.replace("\n\n", "\n")
-        pattern = r'!\[\]\(https?://[^\s)]+\)'
-        cleaned_text = re.sub(pattern, '', content)
-        pattern2 = r'#+ '
-        cleaned_text = re.sub(pattern2, '', cleaned_text)
-        return cleaned_text
+    # def filter_content(self, content: str):
+    #     # content = content.replace("#", "")
+    #     content = content.replace("*", "")
+    #     content = content.replace("> \n", "")
+    #     content = content.replace(" \n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     content = content.replace("\n\n", "\n")
+    #     pattern = r'!\[\]\(https?://[^\s)]+\)'
+    #     cleaned_text = re.sub(pattern, '', content)
+    #     pattern2 = r'#+ '
+    #     cleaned_text = re.sub(pattern2, '', cleaned_text)
+    #     return cleaned_text

     async def get_news_content(self, urls: List[str]) -> List[str]:
         contents = []
@@ -97,10 +97,10 @@ class PsycologySpider(HttpxSpider):
                 exclude_external_links=True,
                 session_id=sessionid,
             )
-            t = self.filter_content(result.markdown)
+            # t = self.filter_content(result.markdown)
             # print(t)
-            contents.append(t)
-            await asyncio.sleep(2)
+            contents.append(result.markdown)
+            # await asyncio.sleep(2)
         return contents
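For reference, the markdown cleanup that this commit disables, reassembled from the removed lines into a standalone function (only the re import and a return annotation are added): it strips emphasis markers and extra blank lines, then removes inline image links and heading prefixes.

import re


def filter_content(content: str) -> str:
    # Behaviour copied from the removed method body above.
    content = content.replace("*", "")
    content = content.replace("> \n", "")
    content = content.replace(" \n", "\n")
    content = content.replace("\n\n", "\n")
    content = content.replace("\n\n", "\n")
    # Drop markdown image links like ![](https://...)
    pattern = r'!\[\]\(https?://[^\s)]+\)'
    cleaned_text = re.sub(pattern, '', content)
    # Drop heading markers ("# ", "## ", ...)
    pattern2 = r'#+ '
    cleaned_text = re.sub(pattern2, '', cleaned_text)
    return cleaned_text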