第02章:浏览器自动化——让AI替你操控网页

第02章:浏览器自动化——让AI替你操控网页


2.1 什么场景需要浏览器自动化

你每天在网页上做的重复性操作,都是浏览器自动化最好的应用场景:

✅ 每天填出差报销单(报销系统 → 填表 → 提交)
✅ 每天查询竞品价格(竞品官网 → 复制数据 → 填入表格)
✅ 批量给客户发邮件(表单页 → 填邮箱 → 发送)
✅ 定期抓取招聘信息(招聘平台 → 筛选 → 记录)
✅ 自动填写新员工入职信息(HR系统 → 逐项录入)

这一章,用两个最常见的例子让你掌握核心逻辑:自动填表和批量数据抓取。


2.2 核心工具:Playwright

OpenClaw 的浏览器自动化基于 Playwright(微软开发的自动化框架),支持 Chromium(含 Chrome、Edge)、Firefox 和 WebKit。

安装:

pip install playwright
playwright install chromium

启动浏览器

from playwright.sync_api import sync_playwright

# Launch Chromium, open one page, and print its title.
with sync_playwright() as playwright:
    # headless=False shows the browser window; headless=True runs it in the background
    chromium_window = playwright.chromium.launch(headless=False)
    tab = chromium_window.new_page()
    tab.goto("https://www.example.com")
    title = tab.title()
    print(title)
    chromium_window.close()

2.3 场景一:自动填写网页表单

目标:在钉钉/飞书审批系统里自动填报销单

from playwright.sync_api import sync_playwright

def auto_fill_reimbursement(items):
    """Fill in and submit a reimbursement form in a headless Chromium browser.

    Args:
        items: Mapping with the keys read below: ``date`` and ``department``
            for the form header, ``details`` (a sequence of mappings with
            ``date``, ``amount``, ``reason`` and ``category``) for the line
            items, and ``receipt_path`` for the file to upload.

    The URL and selectors are the ones used by this example; adapt them to
    the real approval page.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            # 1. Open the reimbursement form
            page.goto("https://internal.feishu.cn/approval/reimburse/new")
            page.wait_for_load_state("networkidle")

            # 2. Fill in the header fields
            page.fill('input[name="title"]', f"差旅报销 {items['date']}")
            page.select_option('select[name="department"]', items['department'])

            # 3. Add one detail row per line item
            rows = page.locator('.reimburse-row')
            for item in items["details"]:
                # Remember how many rows exist before clicking, then address the
                # new row by index. A locator's ``.last`` is resolved when the
                # action runs, so it could still point at the previous row if the
                # new one has not been rendered yet; ``nth(existing_rows)`` waits
                # for the new row to exist instead.
                existing_rows = rows.count()
                page.click('button:has-text("添加明细")')
                new_row = rows.nth(existing_rows)

                new_row.locator('input[name="date"]').fill(item["date"])
                new_row.locator('input[name="amount"]').fill(str(item["amount"]))
                new_row.locator('input[name="reason"]').fill(item["reason"])
                new_row.locator('select[name="category"]').select_option(item["category"])

            # 4. Upload the receipt by passing its file path
            page.set_input_files('input[type="file"]', items["receipt_path"])

            # 5. Submit and wait for the success page
            page.click('button:has-text("提交审批")')
            page.wait_for_url("**/success**")

            total = sum(detail["amount"] for detail in items["details"])
            print(f"✅ 报销单已提交:{items['date']} 合计 ¥{total}")
        finally:
            # Close the browser even when a step above raises
            browser.close()

# Example call: one reimbursement request with three expense lines
expense_lines = [
    {"date": "2026-04-10", "amount": 45, "reason": "客户拜访出租车费", "category": "交通"},
    {"date": "2026-04-10", "amount": 128, "reason": "客户午餐", "category": "餐饮"},
    {"date": "2026-04-12", "amount": 380, "reason": "酒店住宿", "category": "住宿"},
]
reimbursement_request = {
    "date": "2026-04-15",
    "department": "产品部",
    "details": expense_lines,
    "receipt_path": "C:/receipts/2026-04-10.pdf",
}
auto_fill_reimbursement(reimbursement_request)

2.4 场景二:批量抓取网页数据

目标:抓取竞品官网所有产品的价格和规格

from playwright.sync_api import sync_playwright
import json, time

def _parse_price(price_text):
    """Convert a displayed price such as "¥1,299" to a float, or None if it is not numeric."""
    cleaned = price_text.replace("¥", "").replace("￥", "").replace(",", "").strip()
    try:
        return float(cleaned)
    except ValueError:
        return None


def _inner_text_or_none(card, selector):
    """Return the inner text of the first match of ``selector`` inside ``card``, or None."""
    node = card.query_selector(selector)
    return node.inner_text() if node is not None else None


def scrape_competitor_products(url, max_pages=5):
    """Scrape name, price and rating of every product card, following pagination.

    Args:
        url: Address of the first listing page.
        max_pages: Upper bound on the number of listing pages to visit.

    Returns:
        A list of dicts with the keys ``name``, ``price``, ``rating``,
        ``page`` and ``scraped_at``. ``name``, ``price`` and ``rating`` are
        None when the card lacks that element; ``price`` is also None when
        its text is not numeric.

    A timeout raised while waiting for the product cards propagates to the
    caller.
    """
    results = []

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page()

            page.goto(url)
            page.wait_for_load_state("networkidle")

            for current_page in range(1, max_pages + 1):
                print(f"正在抓取第 {current_page} 页...")

                # Wait until the product list has rendered
                page.wait_for_selector('.product-card', timeout=10000)

                # Extract every product on the current page. A card that lacks
                # a field yields None for it instead of aborting the whole run.
                for card in page.query_selector_all('.product-card'):
                    price_text = _inner_text_or_none(card, '.product-price')
                    results.append({
                        "name": _inner_text_or_none(card, '.product-name'),
                        "price": None if price_text is None else _parse_price(price_text),
                        "rating": _inner_text_or_none(card, '.rating-score'),
                        "page": current_page,
                        "scraped_at": time.strftime("%Y-%m-%d %H:%M"),
                    })

                if current_page >= max_pages:
                    break

                # Go to the next page only if the button exists and is enabled;
                # clicking a disabled button would block until the click times out.
                next_btn = page.query_selector('button:has-text("下一页")')
                if next_btn is None or not next_btn.is_enabled():
                    break

                next_btn.click()
                page.wait_for_load_state("networkidle")
                time.sleep(1)  # Pause between pages to keep load on the target server low
        finally:
            # Close the browser even when a step above raises
            browser.close()

    return results

# Scrape the product listing and save it as indented UTF-8 JSON
data = scrape_competitor_products("https://www.example-competitor.com/products")
with open("competitor_prices.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data, ensure_ascii=False, indent=2))

print(f"✅ 抓取完成,共 {len(data)} 个商品")

2.5 反爬应对:让浏览器看起来像真人

网站会检测自动化工具并封IP。这里是几个最有效的反检测策略:

from playwright.sync_api import sync_playwright

def human_like_browser():
    """Open a headless Chromium page configured to look less like an automation tool.

    Returns:
        The newly created page. It remains usable after this function
        returns. When finished, the caller closes the browser with
        ``page.context.browser.close()``. The Playwright driver started here
        is not stopped by this function.
    """
    # Start Playwright explicitly instead of using a ``with`` block: returning
    # from inside the block would shut Playwright down on exit, leaving the
    # caller with a page that can no longer be used.
    p = sync_playwright().start()
    try:
        browser = p.chromium.launch(
            headless=True,
            args=[
                "--disable-blink-features=AutomationControlled",  # Hide the webdriver fingerprint
                "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            ]
        )

        context = browser.new_context(
            viewport={"width": 1920, "height": 1080},
            locale="zh-CN",
            timezone_id="Asia/Shanghai",
            extra_http_headers={
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            }
        )

        page = context.new_page()

        # Simulate a human interaction pattern
        page.mouse.move(500, 300)  # Move the mouse
        page.mouse.wheel(0, 300)   # Scroll the page
        page.wait_for_timeout(1000)  # Pause briefly
    except Exception:
        # Setup failed, so nothing is handed to the caller: release Playwright here
        p.stop()
        raise

    return page

2.6 OpenClaw 集成:让对话触发浏览器操作

OpenClaw 支持在对话中直接触发 Playwright 脚本:

# Example skill configuration for an OpenClaw personal assistant
SKILL_AUTOMATION = dict(
    name="browser-automation",
    triggers=["填表", "填报销", "抓取", "爬", "自动提交"],
    action="run_script",
    script="scripts/auto_form_fill.py",
    params_schema=dict(form_type="string", data="object"),
)

在 OpenClaw 的 SOUL.md 或配置里预设好脚本路径后,你只需要说:

“帮我填一下今天的出差报销,要填4行数据”

OpenClaw 就会自动读取你提供的信息,调用对应的脚本执行。


落地动作

  1. 安装 Playwright(pip install playwright && playwright install chromium)
  2. 用第一个基础脚本(启动浏览器、打开网页、打印标题)跑通流程
  3. 找一个你每天重复填写的表单,把它的字段记录下来
  4. 写出你自己的第一个自动填表脚本
  5. 设置一个每周定时执行的爬虫任务(竞品监控/价格追踪等)