PHP爬虫 -- 017 实战2 爬取猫眼电影top100

使用IP代理池获取数据, 并打印输出

<?php

require 'vendor/autoload.php';
use QL\QueryList;
$base = "https://maoyan.com/board/4?offset=";

// Walk the ten board pages; the offset query parameter advances by 10 per page (0, 10, ..., 90).
for ($page = 0; $page < 100; $page += 10) {
    $url = $base . $page;
    // Print the page URL before scraping it.
    echo $url, "\n";
    get_content($url);
}

/**
 * Fetch one board page, extract the movie fields with QueryList and dump them with var_dump().
 *
 * @param string $url URL of the board page to scrape.
 * @return void
 */
function get_content($url)
{
    // All selectors share this prefix; only the tail differs per field.
    $item = '#app > div > div > div.main > dl > dd';

    // Field name => [CSS selector, attribute to read]. Key order is kept as-is
    // because it determines the order of fields in the dumped rows.
    $rules = [
        'title' => [$item . ' > div > div > div.movie-item-info > p.name > a', 'title'],
        'rank'  => [$item . ' > i', 'text'],
        'img'   => [$item . ' > a > img.board-img', 'data-src'],
        'actor' => [$item . ' > div > div > div.movie-item-info > p.star', 'text'],
        'date'  => [$item . '> div > div > div.movie-item-info > p.releasetime', 'text'],
        'score' => [$item . ' > div > div > div.movie-item-number.score-num > p', 'text'],
    ];

    $html = get_html_source($url);
    $movies = QueryList::html($html)->rules($rules)->queryData();

    var_dump($movies);
}
/**
 * Download a page through the Abuyun HTTP proxy and return the raw response body.
 *
 * Transfer failures and HTTP error statuses are reported through error_log()
 * instead of being dropped silently. The return contract is unchanged: callers
 * still receive false when the transfer fails.
 *
 * @param string $url URL of the page to fetch.
 * @return string|false Response body, or false when the cURL handle could not
 *                      be created or the transfer failed.
 */
function get_html_source($url)
{
    // Proxy server
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // Tunnel credentials
    // NOTE(review): these are committed in plain text; consider loading them
    // from configuration or the environment instead.
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $ch = curl_init();
    if ($ch === false) {
        error_log('get_html_source: curl_init() failed');
        return false;
    }

    curl_setopt_array($ch, [
        // Target page to request
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): peer certificate verification is disabled here, so the
        // HTTPS response is not authenticated - confirm this is intended.
        CURLOPT_SSL_VERIFYPEER => false,
        // Proxy server settings
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        // Tunnel authentication
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        // Give up after 3s connecting / 5s overall
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        // Return the body from curl_exec() instead of printing it
        CURLOPT_RETURNTRANSFER => true,
    ]);

    $result = curl_exec($ch);
    if ($result === false) {
        error_log(sprintf(
            'get_html_source: request to %s failed: (%d) %s',
            $url,
            curl_errno($ch),
            curl_error($ch)
        ));
    } else {
        // The body is still returned on HTTP errors; the status is only logged.
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            error_log(sprintf('get_html_source: %s answered with HTTP %d', $url, $status));
        }
    }

    curl_close($ch);
    return $result;
}
复制代码

爬取数据并生成markdown文档

<?php

require 'vendor/autoload.php';
use QL\QueryList;
$base = "https://maoyan.com/board/4?offset=";

// Ten pages in total: the offset query parameter takes the values 0, 10, ..., 90.
foreach (range(0, 90, 10) as $page) {
    $url = $base . $page;
    // Print the page URL before scraping it.
    print $url . "\n";
    get_content($url);
}

/**
 * Fetch one board page, extract the movie fields with QueryList and hand the
 * resulting rows to make_markdown().
 *
 * @param string $url URL of the board page to scrape.
 * @return void
 */
function get_content($url)
{
    // All selectors share this prefix; only the tail differs per field.
    $item = '#app > div > div > div.main > dl > dd';

    // Field name => [CSS selector, attribute to read].
    $rules = [
        'title' => [$item . ' > div > div > div.movie-item-info > p.name > a', 'title'],
        'rank'  => [$item . ' > i', 'text'],
        'img'   => [$item . ' > a > img.board-img', 'data-src'],
        'actor' => [$item . ' > div > div > div.movie-item-info > p.star', 'text'],
        'date'  => [$item . '> div > div > div.movie-item-info > p.releasetime', 'text'],
        'score' => [$item . ' > div > div > div.movie-item-number.score-num > p', 'text'],
    ];

    $html = get_html_source($url);
    $movies = QueryList::html($html)->rules($rules)->queryData();

    make_markdown($movies);
}

/**
 * Append one Markdown section per movie to maoyan.md in the current working directory.
 *
 * Each section consists of an image line, a "## rank-title-score" heading, two
 * blockquote lines (date, actor) and a horizontal rule. Field values are
 * whitespace-normalised first, because a stray newline inside a value would
 * break the single-line heading / blockquote syntax. Missing fields are
 * written as empty strings.
 *
 * @param array $data List of rows; each row may carry the keys
 *                    'title', 'rank', 'img', 'actor', 'date' and 'score'.
 * @return void
 * @throws \RuntimeException When maoyan.md cannot be opened for appending.
 */
function make_markdown($data)
{
    $md_obj = fopen('maoyan.md', 'a');
    if ($md_obj === false) {
        // Without this check every fwrite() below would be handed false.
        throw new \RuntimeException('Unable to open maoyan.md for appending');
    }

    // Read one field of a row as a single-line, trimmed string.
    $field = function ($row, $key) {
        $text = isset($row[$key]) ? (string) $row[$key] : '';
        $collapsed = preg_replace('/\s+/u', ' ', $text);
        if ($collapsed === null) {
            // preg_replace() returns null on error (e.g. invalid UTF-8): keep the raw text.
            $collapsed = $text;
        }
        return trim($collapsed);
    };

    foreach ($data as $value) {
        $title = $field($value, 'title');
        $rank = $field($value, 'rank');
        $img = $field($value, 'img');
        $actor = $field($value, 'actor');
        $date = $field($value, 'date');
        $score = $field($value, 'score');

        fwrite($md_obj, "![]({$img})\n");
        fwrite($md_obj, "## {$rank}-{$title}-{$score}\n");
        fwrite($md_obj, "> {$date}\n");
        fwrite($md_obj, "> {$actor}\n");
        fwrite($md_obj, "\n\n---\n\n");
    }

    fclose($md_obj);
}

/**
 * Download a page through the Abuyun HTTP proxy and return the raw response body.
 *
 * Transfer failures and HTTP error statuses are reported through error_log()
 * instead of being dropped silently. The return contract is unchanged: callers
 * still receive false when the transfer fails.
 *
 * @param string $url URL of the page to fetch.
 * @return string|false Response body, or false when the cURL handle could not
 *                      be created or the transfer failed.
 */
function get_html_source($url)
{
    // Proxy server
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // Tunnel credentials
    // NOTE(review): these are committed in plain text; consider loading them
    // from configuration or the environment instead.
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $ch = curl_init();
    if ($ch === false) {
        error_log('get_html_source: curl_init() failed');
        return false;
    }

    curl_setopt_array($ch, [
        // Target page to request
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): peer certificate verification is disabled here, so the
        // HTTPS response is not authenticated - confirm this is intended.
        CURLOPT_SSL_VERIFYPEER => false,
        // Proxy server settings
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        // Tunnel authentication
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        // Give up after 3s connecting / 5s overall
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        // Return the body from curl_exec() instead of printing it
        CURLOPT_RETURNTRANSFER => true,
    ]);

    $result = curl_exec($ch);
    if ($result === false) {
        error_log(sprintf(
            'get_html_source: request to %s failed: (%d) %s',
            $url,
            curl_errno($ch),
            curl_error($ch)
        ));
    } else {
        // The body is still returned on HTTP errors; the status is only logged.
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            error_log(sprintf('get_html_source: %s answered with HTTP %d', $url, $status));
        }
    }

    curl_close($ch);
    return $result;
}
复制代码

修改代码, 保存到数据库

先建数据表

-- Table that receives the scraped rows. Apart from `id`, the column names
-- match the field keys produced by the scraper (title, rank, img, actor,
-- date, score), and every scraped value is stored as text.
-- NOTE(review): `rank` is a reserved word in recent MySQL versions, so the
-- backticks around it are required - confirm against the target server.
CREATE TABLE `movies` (
  `id` int(11) NOT NULL AUTO_INCREMENT, -- surrogate primary key
  `title` varchar(255) NOT NULL,
  `rank` varchar(255) NOT NULL,
  `score` varchar(255) NOT NULL,
  `actor` varchar(255) NOT NULL,
  `date` varchar(255) NOT NULL,
  `img` varchar(255) NOT NULL, -- image URL taken from the data-src attribute
  PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;
复制代码

然后把 make_markdown() 函数改为 save_data() 函数, 通过 Medoo 将抓取到的数据写入 movies 表

<?php

require 'vendor/autoload.php';
use QL\QueryList;
use Medoo\Medoo;

$base = "https://maoyan.com/board/4?offset=";

// Database handle; save_data() reads it through the global scope, so the
// variable name must stay $database.
// NOTE(review): connection credentials are hard-coded here - consider moving
// them to configuration.
$database = new Medoo([
    'database_type' => 'mysql',
    'database_name' => 'demo_db',
    'server' => 'localhost',
    'username' => 'root',
    'password' => 'root',
    'charset' => 'utf8'
]);

// Ten pages in total: the offset query parameter takes the values 0, 10, ..., 90.
$i = 0;
while ($i < 10) {
    $page = $i * 10;
    $url = "{$base}{$page}";
    // Print the page URL before scraping it.
    echo $url, "\n";
    get_content($url);
    $i++;
}

/**
 * Fetch one board page, extract the movie fields with QueryList and store the
 * rows through save_data(), then pause for two seconds.
 *
 * A failed download or a page that yields no rows is logged and skipped, so
 * that an empty row set is never forwarded to the database layer.
 *
 * @param string $url URL of the board page to scrape.
 * @return void
 */
function get_content($url)
{
    $html = get_html_source($url);

    if (!is_string($html) || $html === '') {
        // get_html_source() returns the curl_exec() result, which is false on failure.
        error_log("get_content: download failed for {$url}, page skipped");
    } else {
        $data = QueryList::html($html)->rules([
            'title' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.name > a', 'title'],
            'rank' => ['#app > div > div > div.main > dl > dd > i', 'text'],
            'img' => ['#app > div > div > div.main > dl > dd > a > img.board-img', 'data-src'],
            'actor' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-info > p.star', 'text'],
            'date' => ['#app > div > div > div.main > dl > dd> div > div > div.movie-item-info > p.releasetime', 'text'],
            'score' => ['#app > div > div > div.main > dl > dd > div > div > div.movie-item-number.score-num > p', 'text'],
        ])->queryData();

        if (empty($data)) {
            error_log("get_content: no movies matched on {$url}, nothing saved");
        } else {
            save_data($data);
        }
    }

    // Pause between pages regardless of the outcome.
    sleep(2);
}

/**
 * Insert the scraped movie rows into the `movies` table.
 *
 * Uses the database handle stored in the global $database variable.
 * An empty row set is ignored, so no insert is issued when a page yielded
 * nothing.
 *
 * @param array $data Rows to insert, as produced by the scraper.
 * @return void
 */
function save_data($data)
{
    // Nothing scraped: issuing an insert without any rows would be meaningless.
    if (empty($data)) {
        return;
    }

    global $database;
    // Hand all rows of the page to the handle in a single insert() call.
    $database->insert('movies', $data);
}

/**
 * Download a page through the Abuyun HTTP proxy and return the raw response body.
 *
 * Transfer failures and HTTP error statuses are reported through error_log()
 * instead of being dropped silently. The return contract is unchanged: callers
 * still receive false when the transfer fails.
 *
 * @param string $url URL of the page to fetch.
 * @return string|false Response body, or false when the cURL handle could not
 *                      be created or the transfer failed.
 */
function get_html_source($url)
{
    // Proxy server
    $proxyServer = "http://http-dyn.abuyun.com:9020";
    // Tunnel credentials
    // NOTE(review): these are committed in plain text; consider loading them
    // from configuration or the environment instead.
    $proxyUser = "H19D75L76VK89Q8D";
    $proxyPass = "8C17B0A80F475BD8";

    $ch = curl_init();
    if ($ch === false) {
        error_log('get_html_source: curl_init() failed');
        return false;
    }

    curl_setopt_array($ch, [
        // Target page to request
        CURLOPT_URL => $url,
        CURLOPT_HTTPPROXYTUNNEL => false,
        // NOTE(review): peer certificate verification is disabled here, so the
        // HTTPS response is not authenticated - confirm this is intended.
        CURLOPT_SSL_VERIFYPEER => false,
        // Proxy server settings
        CURLOPT_PROXYTYPE => CURLPROXY_HTTP,
        CURLOPT_PROXY => $proxyServer,
        // Tunnel authentication
        CURLOPT_PROXYAUTH => CURLAUTH_BASIC,
        CURLOPT_PROXYUSERPWD => "{$proxyUser}:{$proxyPass}",
        CURLOPT_USERAGENT => "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727;)",
        // Give up after 3s connecting / 5s overall
        CURLOPT_CONNECTTIMEOUT => 3,
        CURLOPT_TIMEOUT => 5,
        // Return the body from curl_exec() instead of printing it
        CURLOPT_RETURNTRANSFER => true,
    ]);

    $result = curl_exec($ch);
    if ($result === false) {
        error_log(sprintf(
            'get_html_source: request to %s failed: (%d) %s',
            $url,
            curl_errno($ch),
            curl_error($ch)
        ));
    } else {
        // The body is still returned on HTTP errors; the status is only logged.
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($status >= 400) {
            error_log(sprintf('get_html_source: %s answered with HTTP %d', $url, $status));
        }
    }

    curl_close($ch);
    return $result;
}
复制代码
我来评几句
登录后评论

已发表评论数()

相关站点

+订阅
热门文章