葛胖胖的小草屋 一言不合 rm -fr /

php使用selenium进行页面抓取

2021-12-06
cossete

阅读:

PHP

场景

使用过python操作selenium来抓取页面数据后,寻思php是否也一样可以

安装webdriver

#项目地址: https://github.com/php-webdriver/php-webdriver
composer require php-webdriver/webdriver

下载对应的 chromedriver 可执行文件

下载的 chromedriver 版本要和本地 Chrome 浏览器的版本一致

查看本地 Chrome 版本信息

#在chrome输入进行查看
chrome://version/

下载所需文件

# 到下面地址下载对应的chrome文件
https://chromedriver.storage.googleapis.com/index.html

启动 chromedriver

./chromedriver --port=4444

实例代码

declare (strict_types=1);


namespace App\Common\Lib\Mp;

use App\Exception\ScanTimeOutException;
use Facebook\WebDriver\Chrome\ChromeOptions;
use Facebook\WebDriver\Remote\DesiredCapabilities;
use Facebook\WebDriver\Remote\RemoteWebDriver;
use Facebook\WebDriver\WebDriverBy;
use Facebook\WebDriver\WebDriverExpectedCondition;

/**
 * Simulates a login to the site at $loginUrl by driving Chrome through a
 * remote WebDriver server.
 *
 * Intended call order: login() -> getLoginQrcode() -> waitScan() -> quit().
 * The browser session is NOT closed automatically; callers must call quit().
 */
class LoginSimulation
{
    /** Upper bound, in seconds, for downloading the QR-code image. */
    private const QRCODE_DOWNLOAD_TIMEOUT = 15;

    /** MIME type used when the server does not report an image content type. */
    private const QRCODE_FALLBACK_MIME = 'image/png';

    private RemoteWebDriver $driver;
    protected string $loginUrl = "https://mp.weixin.qq.com/";

    /**
     * Create the WebDriver session.
     *
     * Reads `webdriver.host` (WebDriver server URL) and `webdriver.debug`
     * from the application config. When debug is falsy, Chrome is started
     * headless.
     *
     * @throws \RuntimeException when the WebDriver session cannot be created
     */
    public function __construct()
    {
        $host = config('webdriver.host');
        $debug = config('webdriver.debug');
        $options = new ChromeOptions();
        // Production deployments must start the browser without a UI.
        if (!$debug) {
            $options->addArguments([
                '--no-sandbox',
                '--headless',
                '--disable-gpu',
                '--disable-dev-shm-usage',
            ]);
        }
        try {
            $capabilities = DesiredCapabilities::chrome();
            $capabilities->setCapability(ChromeOptions::CAPABILITY, $options);
            // Third argument is the connection timeout in milliseconds.
            $this->driver = RemoteWebDriver::create($host, $capabilities, 5000);
        } catch (\Exception $e) {
            // Chain the original exception so the root cause stays visible in logs.
            throw new \RuntimeException("webdriver初始化失败", 0, $e);
        }
    }

    /**
     * Open the login page and submit the account/password form.
     *
     * @param string $username account name typed into the "account" field
     * @param string $password password typed into the "password" field
     * @return $this
     */
    public function login(string $username, string $password): self
    {
        $driver = $this->driver;
        // Load the login page, then drop any cookies left in the session.
        $driver->get($this->loginUrl);
        $driver->manage()->deleteAllCookies();
        // Switch from the default view to the account/password form.
        $driver->findElement(WebDriverBy::linkText("使用帐号登录"))->click();
        // Fill in the credentials and submit.
        $driver->findElement(WebDriverBy::name("account"))->sendKeys($username);
        $driver->findElement(WebDriverBy::name("password"))->sendKeys($password);
        $driver->findElement(WebDriverBy::className('btn_login'))->click();
        return $this;
    }

    /**
     * Fetch the QR code shown after the credentials were submitted.
     *
     * Waits up to 5 seconds (polling every 1000 ms) for the `js_qrcode`
     * element to appear.
     *
     * @return array{qrcode: string, mp_name: string} `qrcode` is a base64 data URI,
     *                                                `mp_name` is the text of the `js_wording` element
     * @throws \RuntimeException when the QR code does not appear or cannot be fetched
     */
    public function getLoginQrcode(): array
    {
        $driver = $this->driver;
        try {
            // Wait for the page transition; the QR code only appears when the
            // credentials were accepted.
            // TODO: repeated failed logins trigger a captcha.
            // TODO: add a protection strategy for too many failed logins.
            $driver->wait(5, 1000)->until(
                WebDriverExpectedCondition::presenceOfElementLocated(WebDriverBy::className("js_qrcode"))
            );
            $qrcode = $driver->findElement(WebDriverBy::className("js_qrcode"))->getAttribute("src");
            $words = $driver->findElement(WebDriverBy::className("js_wording"))->getText();
            // Convert the QR-code URL into an inline base64 image.
            $encode = $this->loginQrcodeToBase64($qrcode);
            return [
                'qrcode' => $encode,
                'mp_name' => $words,
            ];
        } catch (\Throwable $e) {
            // The message is unchanged; the cause is chained for diagnostics.
            throw new \RuntimeException(
                "'您输入的帐号或者密码不正确'或'或失败次数过多出现验证码'或'网络不可达'",
                0,
                $e
            );
        }
    }

    /**
     * Download the QR-code image and return it as a base64 data URI.
     *
     * The image is requested with the browser session's cookies, because it
     * is only served to the session that requested the login.
     *
     * @param  string $qrcode URL of the QR-code image
     * @return string data URI of the form `data:<mime>;base64,<payload>`
     * @throws \RuntimeException when the image cannot be downloaded
     */
    public function loginQrcodeToBase64(string $qrcode): string
    {
        // Build the Cookie header value from the browser session's cookies.
        $cookiePairs = [];
        foreach ($this->driver->manage()->getCookies() as $cookie) {
            $data = $cookie->toArray();
            $cookiePairs[] = $data['name'] . '=' . $data['value'];
        }

        $curl = curl_init();
        if ($curl === false) {
            throw new \RuntimeException("二维码下载失败: curl初始化失败");
        }
        curl_setopt_array($curl, [
            CURLOPT_URL => $qrcode,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_ENCODING => '',
            CURLOPT_MAXREDIRS => 10,
            // A finite timeout: 0 would let a stalled download block forever.
            CURLOPT_TIMEOUT => self::QRCODE_DOWNLOAD_TIMEOUT,
            CURLOPT_FOLLOWLOCATION => true,
            // Treat HTTP >= 400 as a failure instead of encoding an error page.
            CURLOPT_FAILONERROR => true,
            CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
            CURLOPT_CUSTOMREQUEST => 'GET',
            CURLOPT_COOKIE => implode('; ', $cookiePairs),
        ]);
        $response = curl_exec($curl);
        $error = curl_error($curl);
        $contentType = curl_getinfo($curl, CURLINFO_CONTENT_TYPE);
        curl_close($curl);

        // curl_exec() returns false on failure; an empty body is not an image either.
        if (!is_string($response) || $response === '') {
            throw new \RuntimeException("二维码下载失败: " . $error);
        }

        // Use the MIME type reported by the server when it is an image type,
        // dropping any parameters such as "; charset=...".
        $mime = self::QRCODE_FALLBACK_MIME;
        if (is_string($contentType) && preg_match('#^\s*(image/[\w.+-]+)#i', $contentType, $match) === 1) {
            $mime = strtolower($match[1]);
        }

        // No chunk_split(): line breaks inside the payload are not valid in a data URI.
        return 'data:' . $mime . ';base64,' . base64_encode($response);
    }

    /**
     * Wait until the QR code has been scanned and the logged-in page is shown.
     *
     * Polls every 1500 ms for the `menuBar` element.
     *
     * @param int $timeout maximum wait in MINUTES (multiplied by 60 internally)
     * @return array{url: string, cookie: array} current page URL (the caller needs to
     *                                           parse the token out of it) and all session cookies as arrays
     * @throws ScanTimeOutException when the element does not appear in time or any other error occurs
     */
    public function waitScan(int $timeout = 5): array
    {
        $driver = $this->driver;
        try {
            $driver->wait(60 * $timeout, 1500)->until(
                WebDriverExpectedCondition::presenceOfElementLocated(WebDriverBy::id("menuBar"))
            );
            $indexUrl = $driver->getCurrentURL(); // the token must be parsed out of this URL
            $cookieArr = [];
            foreach ($driver->manage()->getCookies() as $cookie) {
                $cookieArr[] = $cookie->toArray();
            }
            return [
                'url' => $indexUrl,
                'cookie' => $cookieArr,
            ];
        } catch (\Throwable $e) {
            // NOTE(review): ScanTimeOutException's constructor is not visible here,
            // so the cause is not chained — confirm whether it accepts a $previous.
            throw new ScanTimeOutException("未有扫码或扫码超时");
        }
    }

    /**
     * Close the browser and end the WebDriver session.
     *
     * @return void
     */
    public function quit()
    {
        $this->driver->quit();
    }
}

Comments

Content