refactor: 优化代码

2023-07-29 15:35:40 +08:00 · 2023-07-29 15:35:40 +08:00 · 4ff2cf8661
parent febbb133d7
commit 4ff2cf8661
17 changed files with 133 additions and 140 deletions
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@ -1,9 +1,11 @@
 from abc import ABC, abstractmethod
 from base.proxy_account_pool import AccountPool
 class AbstractCrawler(ABC):
    @abstractmethod
-    def init_config(self, **kwargs):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool):
        pass
    @abstractmethod
@ -11,7 +13,7 @@ class AbstractCrawler(ABC):
        pass
    @abstractmethod
-    async def search_posts(self):
+    async def search(self):
        pass
--- a/base/proxy_account_pool.py
+++ b/base/proxy_account_pool.py
@ -1,4 +1,4 @@
-from typing import Tuple, Optional, List, Set
+from typing import List, Optional, Set, Tuple
 import config
--- a/config/__init__.py
+++ b/config/__init__.py
@ -1,3 +1,3 @@
-from .base_config import *
 from .account_config import *
+from .base_config import *
 from .db_config import *
--- a/config/base_config.py
+++ b/config/base_config.py
@ -19,8 +19,8 @@ SAVE_LOGIN_STATE = True
 # save user data dir
 USER_DATA_DIR = "%s_user_data_dir"  # %s will be replaced by platform name
-# max page num
+# crawler max notes count
-MAX_PAGE_NUM = 20
+CRAWLER_MAX_NOTES_COUNT = 20
 # max concurrency num
 MAX_CONCURRENCY_NUM = 10
--- a/db.py
+++ b/db.py
@ -1,8 +1,6 @@
-from tortoise import Tortoise
+from tortoise import Tortoise, run_async
-from tortoise import run_async
 from config.db_config import *
 from tools import utils
--- a/main.py
+++ b/main.py
@ -1,9 +1,9 @@
-import sys
-import asyncio
 import argparse
+import asyncio
+import sys
-import db
 import config
+import db
 from base import proxy_account_pool
 from media_platform.douyin import DouYinCrawler
 from media_platform.xhs import XiaoHongShuCrawler
@ -17,14 +17,16 @@ class CrawlerFactory:
        elif platform == "dy":
            return DouYinCrawler()
        else:
-            raise ValueError("Invalid Media Platform Currently only supported xhs or douyin ...")
+            raise ValueError("Invalid Media Platform Currently only supported xhs or dy ...")
 async def main():
    # define command line params ...
    parser = argparse.ArgumentParser(description='Media crawler program.')
-    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
+    parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)', choices=["xhs", "dy"],
-    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.LOGIN_TYPE)
+                        default=config.PLATFORM)
    parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
                        choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
    # init account pool
    account_pool = proxy_account_pool.create_account_pool()
@ -34,9 +36,10 @@ async def main():
        await db.init_db()
    args = parser.parse_args()
-    crawler = CrawlerFactory().create_crawler(platform=args.platform)
+    crawler = CrawlerFactory.create_crawler(platform=args.platform)
    crawler.init_config(
-        command_args=args,
+        platform=args.platform,
        login_type=args.lt,
        account_pool=account_pool
    )
    await crawler.start()
@ -44,6 +47,7 @@ async def main():
 if __name__ == '__main__':
    try:
-        asyncio.run(main())
+        # asyncio.run(main())
+        asyncio.get_event_loop().run_until_complete(main())
    except KeyboardInterrupt:
        sys.exit()
--- a/media_platform/douyin/client.py
+++ b/media_platform/douyin/client.py
@ -1,17 +1,17 @@
 import copy
 import asyncio
-from typing import Optional, Dict, Callable
+import copy
 import httpx
 import execjs
 import urllib.parse
-from playwright.async_api import Page
+from typing import Callable, Dict, Optional
-from playwright.async_api import BrowserContext
+
 import execjs
 import httpx
 from playwright.async_api import BrowserContext, Page
 from .field import *
 from .exception import *
 from tools import utils
 from .exception import *
 from .field import *
 class DOUYINClient:
    def __init__(
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@ -1,38 +1,38 @@
 import os
 import asyncio
 import os
 from asyncio import Task
-from argparse import Namespace
+from typing import Dict, List, Optional, Tuple
 from typing import Optional, List, Dict, Tuple
-from playwright.async_api import async_playwright
+from playwright.async_api import (BrowserContext, BrowserType, Page,
-from playwright.async_api import BrowserType
+                                  async_playwright)
 from playwright.async_api import BrowserContext
 from playwright.async_api import Page
 import config
 from tools import utils
 from .client import DOUYINClient
 from .exception import DataFetchError
 from .login import DouYinLogin
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import douyin
 from tools import utils
 from .client import DOUYINClient
 from .exception import DataFetchError
 from .login import DouYinLogin
 class DouYinCrawler(AbstractCrawler):
    platform: str
    login_type: str
    context_page: Page
    dy_client: DOUYINClient
    account_pool: AccountPool
    browser_context: BrowserContext
    def __init__(self) -> None:
        self.browser_context: Optional[BrowserContext] = None  # type: ignore
        self.context_page: Optional[Page] = None  # type: ignore
        self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
        self.index_url = "https://www.douyin.com"
        self.command_args: Optional[Namespace] = None  # type: ignore
        self.account_pool: Optional[AccountPool] = None  # type: ignore
-    def init_config(self, **kwargs):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
-        for key, value in kwargs.items():
+        self.platform = platform
-            setattr(self, key, value)
+        self.login_type = login_type
        self.account_pool = account_pool
    async def start(self) -> None:
        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@ -53,7 +53,7 @@ class DouYinCrawler(AbstractCrawler):
            self.dy_client = await self.create_douyin_client(httpx_proxy)
            if not await self.dy_client.ping(browser_context=self.browser_context):
                login_obj = DouYinLogin(
-                    login_type=self.command_args.lt,  # type: ignore
+                    login_type=self.login_type,
                    login_phone=account_phone,
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -63,25 +63,25 @@ class DouYinCrawler(AbstractCrawler):
                await self.dy_client.update_cookies(browser_context=self.browser_context)
            # search_posts
-            await self.search_posts()
+            await self.search()
            utils.logger.info("Douyin Crawler finished ...")
-    async def search_posts(self) -> None:
+    async def search(self) -> None:
        utils.logger.info("Begin search douyin keywords")
        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"Current keyword: {keyword}")
            aweme_list: List[str] = []
-            max_note_len = config.MAX_PAGE_NUM
+            dy_limite_count = 10  # douyin fixed limit page 10
            page = 0
-            while max_note_len > 0:
+            while (page + 1) * dy_limite_count <= config.CRAWLER_MAX_NOTES_COUNT:
                try:
-                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * 10)
+                    posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword,
                                                                            offset=page * dy_limite_count)
                except DataFetchError:
                    utils.logger.error(f"search douyin keyword: {keyword} failed")
                    break
                page += 1
                max_note_len -= 10
                for post_item in posts_res.get("data"):
                    try:
                        aweme_info: Dict = post_item.get("aweme_info") or \
@ -93,15 +93,15 @@ class DouYinCrawler(AbstractCrawler):
            utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
            await self.batch_get_note_comments(aweme_list)
-    async def batch_get_note_comments(self, aweme_list: List[str]):
+    async def batch_get_note_comments(self, aweme_list: List[str]) -> None:
        task_list: List[Task] = []
-        _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        for aweme_id in aweme_list:
-            task = asyncio.create_task(self.get_comments(aweme_id, _semaphore), name=aweme_id)
+            task = asyncio.create_task(self.get_comments(aweme_id, semaphore), name=aweme_id)
            task_list.append(task)
        await asyncio.wait(task_list)
-    async def get_comments(self, aweme_id: str, semaphore: "asyncio.Semaphore"):
+    async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
        async with semaphore:
            try:
                await self.dy_client.get_aweme_all_comments(
@ -155,7 +155,7 @@ class DouYinCrawler(AbstractCrawler):
        """Launch browser and create browser context"""
        if config.SAVE_LOGIN_STATE:
            user_data_dir = os.path.join(os.getcwd(), "browser_data",
-                                         config.USER_DATA_DIR % self.command_args.platform)  # type: ignore
+                                         config.USER_DATA_DIR % self.platform)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
@ -173,7 +173,7 @@ class DouYinCrawler(AbstractCrawler):
            )
            return browser_context
-    async def close(self):
+    async def close(self) -> None:
        """Close browser context"""
        await self.browser_context.close()
        utils.logger.info("Browser context closed ...")
--- a/media_platform/douyin/login.py
+++ b/media_platform/douyin/login.py
@ -1,22 +1,17 @@
 import sys
 import asyncio
 import functools
 import sys
 from typing import Optional
 import aioredis
-from tenacity import (
+from playwright.async_api import BrowserContext, Page
-    retry,
+from playwright.async_api import TimeoutError as PlaywrightTimeoutError
-    stop_after_attempt,
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
-    wait_fixed,
+                      wait_fixed)
    retry_if_result,
    RetryError
 )
 from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
 from playwright.async_api import BrowserContext
 import config
 from tools import utils
 from base.base_crawler import AbstractLogin
 from tools import utils
 class DouYinLogin(AbstractLogin):
--- a/media_platform/xhs/client.py
+++ b/media_platform/xhs/client.py
@ -1,16 +1,16 @@
 import json
 import asyncio
-from typing import Optional, Dict
+import json
 from typing import Dict, Optional
 import httpx
-from playwright.async_api import Page
+from playwright.async_api import BrowserContext, Page
 from playwright.async_api import BrowserContext
 from .help import sign, get_search_id
 from .field import SearchSortType, SearchNoteType
 from .exception import DataFetchError, IPBlockError
 from tools import utils
 from .exception import DataFetchError, IPBlockError
 from .field import SearchNoteType, SearchSortType
 from .help import get_search_id, sign
 class XHSClient:
    def __init__(
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@ -1,41 +1,41 @@
 import asyncio
 import os
 import random
 import asyncio
 from asyncio import Task
-from typing import Optional, List, Dict, Tuple
+from typing import Dict, List, Optional, Tuple
 from argparse import Namespace
-from playwright.async_api import Page
+from playwright.async_api import (BrowserContext, BrowserType, Page,
-from playwright.async_api import BrowserContext
+                                  async_playwright)
 from playwright.async_api import async_playwright
 from playwright.async_api import BrowserType
 import config
 from tools import utils
 from .exception import *
 from .login import XHSLogin
 from .client import XHSClient
 from models import xiaohongshu as xhs_model
 from base.base_crawler import AbstractCrawler
 from base.proxy_account_pool import AccountPool
 from models import xiaohongshu as xhs_model
 from tools import utils
 from .client import XHSClient
 from .exception import DataFetchError
 from .login import XHSLogin
 class XiaoHongShuCrawler(AbstractCrawler):
    platform: str
    login_type: str
    context_page: Page
    browser_context: BrowserContext
    xhs_client: XHSClient
    account_pool: AccountPool
    browser_context: BrowserContext
-    def __init__(self):
+    def __init__(self) -> None:
        self.index_url = "https://www.xiaohongshu.com"
        self.command_args: Optional[Namespace] = None # type: ignore
        self.user_agent = utils.get_user_agent()
-    def init_config(self, **kwargs):
+    def init_config(self, platform: str, login_type: str, account_pool: AccountPool) -> None:
-        for key, value in kwargs.items():
+        self.platform = platform
-            setattr(self, key, value)
+        self.login_type = login_type
        self.account_pool = account_pool
-    async def start(self):
+    async def start(self) -> None:
        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
        async with async_playwright() as playwright:
            # Launch a browser context.
@ -62,7 +62,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
            self.xhs_client = await self.create_xhs_client(httpx_proxy)
            if not await self.xhs_client.ping():
                login_obj = XHSLogin(
-                    login_type=self.command_args.lt,
+                    login_type=self.login_type,
                    login_phone=account_phone,
                    browser_context=self.browser_context,
                    context_page=self.context_page,
@ -72,28 +72,27 @@ class XiaoHongShuCrawler(AbstractCrawler):
                await self.xhs_client.update_cookies(browser_context=self.browser_context)
            # Search for notes and retrieve their comment information.
-            await self.search_posts()
+            await self.search()
            utils.logger.info("Xhs Crawler finished ...")
-    async def search_posts(self) -> None:
+    async def search(self) -> None:
        """Search for notes and retrieve their comment information."""
        utils.logger.info("Begin search xiaohongshu keywords")
-
+        xhs_limit_count = 20  # xhs limit page fixed value
        for keyword in config.KEYWORDS.split(","):
            utils.logger.info(f"Current keyword: {keyword}")
            max_note_len = config.MAX_PAGE_NUM
            page = 1
-            while max_note_len > 0:
+            while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                note_id_list: List[str] = []
-                posts_res = await self.xhs_client.get_note_by_keyword(
+                notes_res = await self.xhs_client.get_note_by_keyword(
                    keyword=keyword,
                    page=page,
                )
-                _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
                task_list = [
-                    self.get_note_detail(post_item.get("id"), _semaphore)
+                    self.get_note_detail(post_item.get("id"), semaphore)
-                    for post_item in posts_res.get("items", {})
+                    for post_item in notes_res.get("items", {})
                ]
                note_details = await asyncio.gather(*task_list)
                for note_detail in note_details:
@ -101,11 +100,10 @@ class XiaoHongShuCrawler(AbstractCrawler):
                        await xhs_model.update_xhs_note(note_detail)
                        note_id_list.append(note_detail.get("note_id"))
                page += 1
                max_note_len -= 20
                utils.logger.info(f"Note details: {note_details}")
                await self.batch_get_note_comments(note_id_list)
-    async def get_note_detail(self, note_id: str, semaphore: "asyncio.Semaphore") -> Optional[Dict]:
+    async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
        """Get note detail"""
        async with semaphore:
            try:
@ -117,14 +115,14 @@ class XiaoHongShuCrawler(AbstractCrawler):
    async def batch_get_note_comments(self, note_list: List[str]):
        """Batch get note comments"""
        utils.logger.info(f"Begin batch get note comments, note list: {note_list}")
-        _semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list: List[Task] = []
        for note_id in note_list:
-            task = asyncio.create_task(self.get_comments(note_id, _semaphore), name=note_id)
+            task = asyncio.create_task(self.get_comments(note_id, semaphore), name=note_id)
            task_list.append(task)
        await asyncio.gather(*task_list)
-    async def get_comments(self, note_id: str, semaphore: "asyncio.Semaphore"):
+    async def get_comments(self, note_id: str, semaphore: asyncio.Semaphore):
        """Get note comments"""
        async with semaphore:
            utils.logger.info(f"Begin get note id comments {note_id}")
@ -147,7 +145,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
        return phone, playwright_proxy, httpx_proxy
-    async def create_xhs_client(self, httpx_proxy: str) -> XHSClient:
+    async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
        """Create xhs client"""
        utils.logger.info("Begin create xiaohongshu API client ...")
        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
@ -177,18 +175,19 @@ class XiaoHongShuCrawler(AbstractCrawler):
        if config.SAVE_LOGIN_STATE:
            # feat issue #14
            # we will save login state to avoid login every time
-            user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform) # type: ignore
+            user_data_dir = os.path.join(os.getcwd(), "browser_data",
                                         config.USER_DATA_DIR % self.platform)  # type: ignore
            browser_context = await chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                accept_downloads=True,
                headless=headless,
-                proxy=playwright_proxy, # type: ignore
+                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent
            )
            return browser_context
        else:
-            browser = await chromium.launch(headless=headless, proxy=playwright_proxy) # type: ignore
+            browser = await chromium.launch(headless=headless, proxy=playwright_proxy)  # type: ignore
            browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent
--- a/media_platform/xhs/login.py
+++ b/media_platform/xhs/login.py
@ -1,21 +1,16 @@
 import sys
 import asyncio
 import functools
 import sys
 from typing import Optional
 import aioredis
-from tenacity import (
+from playwright.async_api import BrowserContext, Page
-    retry,
+from tenacity import (RetryError, retry, retry_if_result, stop_after_attempt,
-    stop_after_attempt,
+                      wait_fixed)
    wait_fixed,
    retry_if_result,
    RetryError
 )
 from playwright.async_api import Page
 from playwright.async_api import BrowserContext
 import config
 from tools import utils
 from base.base_crawler import AbstractLogin
 from tools import utils
 class XHSLogin(AbstractLogin):
@ -24,7 +19,7 @@ class XHSLogin(AbstractLogin):
                 login_type: str,
                 browser_context: BrowserContext,
                 context_page: Page,
-                 login_phone: str = "",
+                 login_phone: Optional[str] = "",
                 cookie_str: str = ""
                 ):
        self.login_type = login_type
--- a/models/douyin.py
+++ b/models/douyin.py
@ -1,8 +1,8 @@
 import json
 from typing import Dict, List
 from tortoise.models import Model
 from tortoise import fields
 from tortoise.models import Model
 import config
 from tools import utils
--- a/models/xiaohongshu.py
+++ b/models/xiaohongshu.py
@ -1,7 +1,7 @@
-from typing import List, Dict
+from typing import Dict, List
 from tortoise.models import Model
 from tortoise import fields
 from tortoise.models import Model
 import config
 from tools import utils
@ -65,7 +65,7 @@ async def update_xhs_note(note_item: Dict):
    local_db_item = {
        "note_id": note_item.get("note_id"),
        "type": note_item.get("type"),
-        "title": note_item.get("title"),
+        "title": note_item.get("title") or note_item.get("desc", ""),
        "desc": note_item.get("desc", ""),
        "time": note_item.get("time"),
        "last_update_time": note_item.get("last_update_time", 0),
--- a/recv_sms_notification.py
+++ b/recv_sms_notification.py
@ -1,7 +1,7 @@
 # Start an HTTP server to receive SMS forwarding notifications and store them in Redis.
 import re
 import json
 import asyncio
 import json
 import re
 from typing import List
 import aioredis
--- a/tools/easing.py
+++ b/tools/easing.py
@ -2,10 +2,11 @@
 # -*- coding: utf-8 -*-
 # copy from https://github.com/aneasystone/selenium-test/blob/master/12-slider-captcha.py
 # thanks to aneasystone for his great work
 import numpy as np
 import math
 from typing import List, Tuple
 import numpy as np
 # https://github.com/gdsmith/jquery.easing/blob/master/jquery.easing.js
 def ease_in_quad(x):
--- a/tools/utils.py
+++ b/tools/utils.py
@ -1,19 +1,18 @@
 import re
 import os
 import time
 import random
 import base64
 import logging
 import os
 import random
 import re
 import time
 from io import BytesIO
 from typing import Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 from typing import Optional, Dict, List, Tuple
 import cv2
 import httpx
 import numpy as np
 from PIL import Image, ImageDraw
-from playwright.async_api import Cookie
+from playwright.async_api import Cookie, Page
 from playwright.async_api import Page
 async def find_login_qrcode(page: Page, selector: str) -> str:
--- a/base/proxy_account_pool.py
+++ b/base/proxy_account_pool.py
@ -1,4 +1,4 @
-from typing import Tuple, Optional, List, Set
+from typing import List, Optional, Set, Tuple
 import config