Преглед на файлове

📦 小程序贴图抓取脚本

快乐的梦鱼 преди 1 месец
родител
ревизия
a75b5ffeca
променени са 4 файла, в които са добавени 521 реда и са изтрити 0 реда
  1. 68 0
      collect-app-msg.md
  2. 399 0
      collect-app-msg.py
  3. 54 0
      data.json
  4. BIN
      小程序贴图抓取脚本.zip

+ 68 - 0
collect-app-msg.md

@@ -0,0 +1,68 @@
+# collect-app-msg.py 使用说明
+
+用于抓取微信“小程序话题”下的公众号文章(贴图),并以 JSON 结构输出。
+
+## 环境要求
+
+- Python 3.10+(已在 Windows + Python 3.13 环境验证)
+- 网络可访问 `mp.weixin.qq.com`。不要在常见云服务商的静态 IP 环境下运行,否则会触发验证码校验;
+  请在已登录过本人微信的个人电脑上运行。
+
+## 安装依赖
+
+```bash
+pwsh -NoProfile -Command "python -m pip install -U requests playwright; python -m playwright install chromium"
+```
+
+## 运行
+
+```bash
+pwsh -NoProfile -Command "python ./collect-app-msg.py --topic '#乡源文化挖掘' --out data.json"
+```
+
+常用参数:
+
+- `--topic`:**必填**。话题名称,例如 `#乡源文化挖掘`
+- `--out`:输出文件名,默认 `data.json`
+- `--max-pages`:最多抓取页数,默认 `10`(`<=0` 表示不限制)
+- `--sleep`:每条之间延迟秒数,默认 `0.3`
+
+示例(只抓 1 页):
+
+```bash
+pwsh -NoProfile -Command "python ./collect-app-msg.py --topic '#乡源文化挖掘' --max-pages 1 "
+```
+
+## 输出格式
+
+输出文件:默认为当前工作目录下的 `data.json`,内容是一个 JSON 数组,每个元素对应一篇文章:
+
+```json
+[
+  {
+    "outlinkId": 2247483702,
+    "userId": 0,
+    "url": "https://mp.weixin.qq.com/s?...",
+    "title": "标题",
+    "images": ["https://.../0?wx_fmt=jpeg"],
+    "content": "<a class=\"wx_topic_link\" ...>...</a>\n"
+  }
+]
+```
+
+字段说明:
+
+- `outlinkId`:微信官方文章的 `msgid`(用于去重)
+- `userId`:从乡源小程序传入的用户ID,如果不是从小程序发出来的则为 `0`
+- `images`:文章内图片的 URL 列表
+- `content`:文章正文(HTML 字符串)
+
+## 增量抓取与去重规则
+
+脚本会在抓取前读取 `--out` 指定的 JSON:
+
+- **判重键**:优先使用 `outlinkId`;缺失时使用 `url`
+- **跳过详情页请求**:若 `getTopicList` 返回的某条文章已存在于历史 JSON 中,则不会再打开详情页
+- **终止翻页**:如果某一页 `getTopicList` 的所有条目都已存在(本页没有任何新条目),则停止请求下一页
+- **保存顺序**:写回 JSON 时,**本次新抓到的条目会放在最上面**
+- **合并去重**:写回时仍会去重,且 **新数据优先**(同一 `outlinkId/url` 时保留新条目)

+ 399 - 0
collect-app-msg.py

@@ -0,0 +1,399 @@
+"""
+抓取微信“小程序话题”下的公众号文章(贴图)并输出为 JSON。
+
+参考实现来源:
+- server/src/api/WxAppmsgContent.ts
+- server/src/services/content/appmsg/AppMsgService.ts
+
+脚本参数
+
+  --topic:话题名称,例如 #乡源文化挖掘
+  --out:输出文件名,默认 data.json
+
+输出
+
+  JSON 数组,每条为 AppMsg:
+  * outlinkId: int : 微信官方贴图的 ID,可以拿来去重
+  * userId: int : 从小程序传来的用户ID
+  * url: str : 实际跳转的 URL
+  * title: str : 标题
+  * images: List[str] : 图片 URL 列表
+  * content: str : 内容
+
+用法(PowerShell / pwsh):
+  pwsh -NoProfile -Command "python ./collect-app-msg.py --topic '#乡源文化挖掘' --out data.json"
+
+依赖:
+  python -m pip install -U requests playwright
+  python -m playwright install chromium
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import time
+from dataclasses import dataclass, asdict
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Set
+
+import requests
+
+
+# Endpoint of the WeChat topic API that get_topic_list sends its GET request to.
+TOPIC_LIST_ENDPOINT = "https://mp.weixin.qq.com/mp/appmsgtopic"
+# Desktop Chrome User-Agent string, used for both the requests session and
+# the Playwright browser context.
+UA = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/91.0.4472.124 Safari/537.36"
+)
+
+
+@dataclass
+class AppMsg:
+    # 结构同 server/src/models/content/appmsg/AppMsgModel.ts
+    outlinkId: int = 0
+    userId: int = 0
+    url: str = ""
+    title: str = ""
+    images: List[str] = None  # type: ignore[assignment]
+    content: str = ""
+
+    def __post_init__(self) -> None:
+        if self.images is None:
+            self.images = []
+
+
+def _lazy_import_playwright():
+    """
+    延迟导入 Playwright,避免未安装时报错(只有用户选择浏览器抓取才需要)。
+    """
+    try:
+        from playwright.sync_api import sync_playwright  # type: ignore
+
+        return sync_playwright
+    except Exception as e:
+        raise RuntimeError(
+            "Playwright 未安装或不可用。请执行:python -m pip install -U playwright && python -m playwright install chromium"
+        ) from e
+
+def _load_existing_appmsgs(path: str) -> List[Dict[str, Any]]:
+    try:
+        # 兼容:历史文件可能是 utf-8 或 utf-8-sig(带 BOM)
+        try:
+            with open(path, "r", encoding="utf-8-sig") as f:
+                data = json.load(f)
+        except UnicodeError:
+            with open(path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        if isinstance(data, list):
+            return [x for x in data if isinstance(x, dict)]
+    except FileNotFoundError:
+        return []
+    except json.JSONDecodeError:
+        # 文件存在但不是合法 JSON:当作无历史数据
+        return []
+    return []
+
+
+def _build_seen_keys(existing: List[Dict[str, Any]]) -> Set[str]:
+    """
+    用 outlinkId 优先去重;缺失时用 url 兜底。
+    """
+    seen: Set[str] = set()
+    for it in existing:
+        outlink_id = it.get("outlinkId")
+        if isinstance(outlink_id, int) and outlink_id:
+            seen.add(f"id:{outlink_id}")
+        url = it.get("url")
+        if isinstance(url, str) and url:
+            seen.add(f"url:{url}")
+    return seen
+
+
+def _topic_msg_key(topic_msg: Dict[str, Any]) -> Tuple[Optional[str], Optional[int], Optional[str]]:
+    outlink_id: Optional[int] = None
+    try:
+        outlink_id = int(((topic_msg.get("id") or {}).get("msgid")) or 0) or None
+    except Exception:
+        outlink_id = None
+    url = topic_msg.get("jump_url")
+    if not isinstance(url, str) or not url:
+        url = None
+    key = None
+    if outlink_id:
+        key = f"id:{outlink_id}"
+    elif url:
+        key = f"url:{url}"
+    return key, outlink_id, url
+
+
+def _requests_session() -> requests.Session:
+    s = requests.Session()
+    s.headers.update(
+        {
+            "User-Agent": UA,
+            "Accept": "application/json,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+        }
+    )
+    return s
+
+
+def get_topic_list(
+    sess: requests.Session, *, topic: str, paging: str = ""
+) -> Dict[str, Any]:
+    """
+    等价于 WxAppmsgContentApi.getTopicList:
+    GET /mp/appmsgtopic?action=topic_list&topic=...&paging=...&f=json...
+    """
+    params = {
+        "action": "topic_list",
+        "topic": topic,
+        "paging": paging or "",
+        "sort_type": 1,
+        "from": 1,
+        # 下面这些字段在 TS 侧是固定值/空值,保持一致以提高兼容性
+        "from_biz": 3636524509,
+        "from_msgid": 2247483692,
+        "from_itemidx": 1,
+        "appid": "",
+        "silent": 1,
+        "uin": 0,
+        "key": "",
+        "pass_ticket": "",
+        "wxtoken": "",
+        "devicetype": "",
+        "clientversion": "false",
+        "version": "false",
+        "appmsg_token": "",
+        "x5": 0,
+        "f": "json",
+        "user_article_role": 0,
+    }
+    r = sess.get(TOPIC_LIST_ENDPOINT, params=params, timeout=30)
+    r.raise_for_status()
+    data = r.json()
+    # 微信侧常见返回:{ code, message, ... } 或直接业务字段
+    if isinstance(data, dict) and "code" in data and data.get("code") not in (None, 0):
+        raise RuntimeError(f"topic_list failed: code={data.get('code')} message={data.get('message')}")
+    return data
+
+
+def parse_cgi_data_new_from_html(html: str) -> Dict[str, Any]:
+    raise RuntimeError("已移除基于 HTML 的解析方式,请使用 Playwright 读取 window.cgiDataNew")
+
+
+def get_appmsg_detail_via_playwright(url: str, *, timeout_ms: int = 30000) -> Dict[str, Any]:
+    """
+    Read ``window.cgiDataNew`` from the article page using a real browser
+    (the closest thing to the page's actual runtime state).
+    Advantage: avoids HTML text parsing and object-literal compatibility issues.
+
+    Args:
+        url: Article page to open.
+        timeout_ms: Navigation timeout passed to ``page.goto``.
+
+    Returns:
+        A JSON round-tripped copy of the page's ``cgiDataNew`` object.
+
+    Raises:
+        RuntimeError: If Playwright cannot be imported, or if the page does
+            not expose ``cgiDataNew`` as an object.
+    """
+    sync_playwright = _lazy_import_playwright()
+    # A fresh headless Chromium instance is launched for every call.
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        context = browser.new_context(
+            user_agent=UA,
+            locale="zh-CN",
+            extra_http_headers={
+                "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+            },
+        )
+        page = context.new_page()
+        try:
+            # Only wait for DOMContentLoaded before evaluating the script.
+            page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
+            # The script returns null when cgiDataNew is absent; otherwise it
+            # serialises the object to JSON and parses it back.
+            data = page.evaluate(
+                """() => {
+                  const d = (globalThis && globalThis.cgiDataNew) ? globalThis.cgiDataNew : (window && window.cgiDataNew);
+                  if (!d) return null;
+                  return JSON.parse(JSON.stringify(d));
+                }"""
+            )
+            # Python-side validation: anything other than a dict is a failure.
+            if not isinstance(data, dict):
+                raise RuntimeError("window.cgiDataNew not found or not an object")
+            return data
+        finally:
+            # Close the context first; the nested finally guarantees the
+            # browser is closed even if closing the context raises.
+            try:
+                context.close()
+            finally:
+                browser.close()
+
+
+def get_appmsg_detail(sess: requests.Session, url: str) -> Dict[str, Any]:
+    # 已移除 json5 / node 的 HTML 解析逻辑,统一走浏览器读取 window.cgiDataNew
+    return get_appmsg_detail_via_playwright(url)
+
+
+def _extract_user_id_from_detail(detail: Dict[str, Any]) -> int:
+    """
+    复刻 AppMsgService.collectAppMsgContent 中的 userId 推导逻辑。
+    """
+    try:
+        eps = detail.get("ext_publish_source") or {}
+        weapp = eps.get("weapp_info") or {}
+        desc = weapp.get("desc") or ""
+        if isinstance(desc, str) and desc.startswith("亮乡源话题"):
+            m = re.search(r"亮乡源话题(\d+)", desc)
+            if m:
+                return int(m.group(1))
+    except Exception:
+        pass
+    return 0
+
+
+def _extract_images_from_detail(detail: Dict[str, Any]) -> List[str]:
+    pics = detail.get("picture_page_info_list") or []
+    images: List[str] = []
+    if isinstance(pics, list):
+        for it in pics:
+            if isinstance(it, dict):
+                url = it.get("cdn_url")
+                if isinstance(url, str) and url:
+                    images.append(url)
+    return images
+
+
+def build_appmsg(topic_msg: Dict[str, Any], detail: Dict[str, Any]) -> AppMsg:
+    outlink_id = 0
+    try:
+        outlink_id = int(((topic_msg.get("id") or {}).get("msgid")) or 0)
+    except Exception:
+        outlink_id = 0
+
+    url = topic_msg.get("jump_url") or ""
+    if not isinstance(url, str):
+        url = ""
+
+    title = detail.get("title") or ""
+    if not isinstance(title, str):
+        title = ""
+
+    content = detail.get("content_noencode") or ""
+    if not isinstance(content, str):
+        content = ""
+
+    return AppMsg(
+        id=0,
+        outlinkId=outlink_id,
+        userId=_extract_user_id_from_detail(detail),
+        url=url,
+        title=title,
+        images=_extract_images_from_detail(detail),
+        content=content,
+    )
+
+
+def collect_topic_appmsgs(
+    *,
+    topic: str,
+    max_pages: int = 10,
+    sleep_sec: float = 0.3,
+    seen_keys: Optional[Set[str]] = None,
+) -> List[AppMsg]:
+    sess = _requests_session()
+    paging = ""
+    page = 0
+    results: List[AppMsg] = []
+    local_seen: Set[str] = set(seen_keys or set())
+
+    while True:
+        page += 1
+        if max_pages > 0 and page > max_pages:
+            break
+
+        data = get_topic_list(sess, topic=topic, paging=paging)
+        topic_msgs = data.get("topic_msgs") or []
+        if not isinstance(topic_msgs, list):
+            break
+
+        page_has_new = False
+        for item in topic_msgs:
+            if not isinstance(item, dict):
+                continue
+
+            key, outlink_id, jump_url = _topic_msg_key(item)
+            if key and key in local_seen:
+                continue
+            if not jump_url:
+                continue
+
+            # 只有未抓取过的条目才请求详情页
+            detail = get_appmsg_detail(sess, jump_url)
+            appmsg = build_appmsg(item, detail)
+
+            # 标记为已抓取
+            if appmsg.outlinkId:
+                local_seen.add(f"id:{appmsg.outlinkId}")
+            if appmsg.url:
+                local_seen.add(f"url:{appmsg.url}")
+            results.append(appmsg)
+            page_has_new = True
+
+            if sleep_sec > 0:
+                time.sleep(sleep_sec)
+
+        # 如果这一页全部在已有 JSON 中(没有任何新条目),终止后续翻页
+        if not page_has_new:
+            break
+
+        paging = data.get("topic_paging") or ""
+        if not isinstance(paging, str) or not paging:
+            break
+
+        # 如果本页没抓到任何内容,就停止继续翻页
+        if len(topic_msgs) == 0:
+            break
+
+    return results
+
+
+def main(argv: Optional[List[str]] = None) -> int:
+    parser = argparse.ArgumentParser(description="抓取微信话题下的公众号文章(贴图)并输出 JSON")
+    parser.add_argument("--topic", required=True, help="话题名称,例如:#乡源文化挖掘")
+    parser.add_argument("--out", default="data.json", help="输出文件名,默认 data.json")
+    parser.add_argument("--max-pages", type=int, default=10, help="最多抓取页数,默认 10(<=0 表示不限制)")
+    parser.add_argument("--sleep", type=float, default=0.3, help="每条之间的延迟秒数,默认 0.3")
+    args = parser.parse_args(argv)
+
+    existing = _load_existing_appmsgs(args.out)
+    seen = _build_seen_keys(existing)
+
+    new_appmsgs = collect_topic_appmsgs(
+        topic=args.topic,
+        max_pages=args.max_pages,
+        sleep_sec=args.sleep,
+        seen_keys=seen,
+    )
+    new_payload = [asdict(x) for x in new_appmsgs]
+
+    # 保存时:最新抓到的放在最上面;并按 outlinkId/url 去重(新覆盖旧)
+    merged: List[Dict[str, Any]] = []
+    merged_seen: Set[str] = set()
+    for it in new_payload + existing:
+        if not isinstance(it, dict):
+            continue
+        key: Optional[str] = None
+        outlink_id = it.get("outlinkId")
+        url = it.get("url")
+        if isinstance(outlink_id, int) and outlink_id:
+            key = f"id:{outlink_id}"
+        elif isinstance(url, str) and url:
+            key = f"url:{url}"
+        if key and key in merged_seen:
+            continue
+        if key:
+            merged_seen.add(key)
+        merged.append(it)
+
+    # 写出 utf-8-sig(带 BOM),避免部分 Windows 工具按 ANSI/GBK 误判导致“中文乱码”
+    with open(args.out, "w", encoding="utf-8-sig") as f:
+        json.dump(merged, f, ensure_ascii=False, indent=2)
+
+    print(
+        f"OK: topic={args.topic} new={len(new_payload)} total={len(merged)} out={args.out}"
+    )
+    return 0
+
+
+# When run as a script, use main()'s return value as the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 54 - 0
data.json

@@ -0,0 +1,54 @@
+[
+  {
+    "outlinkId": 2247483702,
+    "userId": 414,
+    "url": "https://mp.weixin.qq.com/s?__biz=MzYzNjUyNDUwOQ==&mid=2247483702&idx=1&sn=3707b26797274f1833b2a722f0da0b25&chksm=f0fe4384c789ca92f10881299f99dce7bf37da1fc14139580152d3559e8175c694c7be2cc62e&scene=327#rd",
+    "title": "#乡源文化挖掘",
+    "images": [
+      "https://mmbiz.qpic.cn/mmbiz_jpg/0ibqUERIslOXh5lWhibFEvj1zOTzBEguqe0Q6icqxy6Jl4aNrdXsuFTKnupagYfPonj6ADf6z9iauxKlyevmp7P10UkXvZLk4AVJ9keEvmGZIdE/0?wx_fmt=webp"
+    ],
+    "content": "\n\n<a class=\"wx_topic_link\" data-topic=\"1\" data-recommend=\"0\" style=\"color: #576B95;\">#乡源文化挖掘</a>\n"
+  },
+  {
+    "outlinkId": 2247483697,
+    "userId": 0,
+    "url": "https://mp.weixin.qq.com/s?__biz=MzYzNjUyNDUwOQ==&mid=2247483697&idx=1&sn=e15e1ad0a5c58b593efc2de9b722f746&chksm=f0fe4383c789ca959067d30c0cbeb82b996ab9a67353da575180797d9f5779d65197c541afeb&scene=327#rd",
+    "title": "#乡源文化挖掘",
+    "images": [
+      "https://mmbiz.qpic.cn/mmbiz_png/0ibqUERIslOVBKVNNY9sVvXxzp6eYUna8icM91pojuYGqubicHVvyhIvCe6ibS0icESmRy0THuicCT0ticcnicawKMT2R6Q0kiaCBzp8NjNeicptAW0Wo/0?wx_fmt=png"
+    ],
+    "content": "\n\n<a class=\"wx_topic_link\" data-topic=\"1\" data-recommend=\"0\" style=\"color: #576B95;\">#乡源文化挖掘</a>\n"
+  },
+  {
+    "outlinkId": 2247483687,
+    "userId": 0,
+    "url": "https://mp.weixin.qq.com/s?__biz=MzYzNjUyNDUwOQ==&mid=2247483687&idx=1&sn=d1f5278ec6087b532294e6da7d4a0940&chksm=f0fe4395c789ca83ab6f22edef5a82dd22dd155f303044ce8aca7b264ff4935fb8866aba7f9c&scene=327#rd",
+    "title": "#乡源文化挖掘",
+    "images": [
+      "https://mmbiz.qpic.cn/sz_mmbiz_jpg/0ibqUERIslOW8b0QahenAGSkdjAPTgM0AELHV2mxRDAjeMhhHJGgpEESELeCwwuOA8peAmzd4V1nqdUyRWLLZw4LTj1FjIctM3zMHXkB4DicA/0?wx_fmt=jpeg"
+    ],
+    "content": "<a class=\"wx_topic_link\" data-topic=\"1\" style=\"color: #576B95;\">#乡源文化挖掘</a>\n"
+  },
+  {
+    "outlinkId": 2247483682,
+    "userId": 0,
+    "url": "https://mp.weixin.qq.com/s?__biz=MzYzNjUyNDUwOQ==&mid=2247483682&idx=1&sn=09301714a7ab8ec7ae8bff8225dd5402&chksm=f0fe4390c789ca8658014b829dd497ffa27db0d5d167050bec66f7b41f9d16dfa75ac77af29a&scene=327#rd",
+    "title": "吉安",
+    "images": [
+      "https://mmbiz.qpic.cn/sz_mmbiz_jpg/0ibqUERIslOUbP3Qw3O34KlouibWAbr2ianaC3tBC8xp9Cwbm4c9nxNeYicQajBU4sXGfutwUDfeR80cekb9TEtRF8IGXusoLhwmVtpOs6RSYKM/0?wx_fmt=jpeg",
+      "https://mmbiz.qpic.cn/mmbiz_jpg/0ibqUERIslOWZg3dGwnAhoSt9lh06hibndibCjMqia4q3a8VRZFicWvxBbicAoHib4laMhI8Nf7HJKxQbU57nlCatPORRN0RCmFzewpJrr7mZ0hunc/0?wx_fmt=jpeg",
+      "https://mmbiz.qpic.cn/sz_mmbiz_jpg/0ibqUERIslOX3Q0jxaS0GRqiazBVVcPVoZ8AkEIuwUt6ibMSrDBTcB6cTVXHMYVqHEdeBP4jwGKiazXCZianyhSiavcBC320nDicsODVC4M4v2ibGicQ/0?wx_fmt=jpeg"
+    ],
+    "content": "吉安\n\n<a class=\"wx_topic_link\" data-topic=\"1\" style=\"color: #576B95;\">#乡源文化挖掘</a>\n"
+  },
+  {
+    "outlinkId": 2247483652,
+    "userId": 0,
+    "url": "https://mp.weixin.qq.com/s?__biz=MzMwNjA2OTU5Nw==&mid=2247483652&idx=1&sn=ee51a470ac42cad1ae4c2b7b190bf874&chksm=ed302836da47a120316fbf7135426d96769402de51b34daa349d6531a570417aa5f37d3a33a4&scene=327#rd",
+    "title": "马年大吉",
+    "images": [
+      "https://mmbiz.qpic.cn/sz_mmbiz_jpg/efm4hU59AzDXicdWmOiafDCLhicfOS6TxiaH9F97aS7Lf0g7vx3RPsKiauMeR5ppnQY4MUiauDlUnSZjibvApURWDnpg6l8fC8ZcevIqw66GSAthNg/0?wx_fmt=jpeg"
+    ],
+    "content": "\n\n<a class=\"wx_topic_link\" data-topic=\"1\" style=\"color: #576B95;\">#乡源文化挖掘</a>\n"
+  }
+]

BIN
小程序贴图抓取脚本.zip