Coder Social home page Coder Social logo

haixun's Introduction

haixun

安装

环境要求

安装

使用 composer:

$ composer require wengoooo/haixun

快速开始

建立一个爬虫

require_once "vendor/autoload.php";

use GuzzleHttp\Psr7\Request;
/**
 * Example spider: starts from one category listing, requests the remaining
 * listing pages, and visits every product link found on each listing.
 */
class TheBaseSpider extends \Haixun\Core\Spiders
{
    /** @var int Last listing page, read from the "#max_page" element. */
    public $maxPage = 1;

    /** @var int Listing page counter; starts at 1 and is advanced before each follow-up page request. */
    public $currentPage = 1;

    /** @var string|null First "user_..." token found in the listing body, or null if none was found. */
    public $userId;

//    public $startUrls = ['http://www.httpbin.org/get', 'http://www.httpbin.org/user-agent'];

    /**
     * Yields the seed request for the crawl.
     */
    public function startRequests()
    {
        yield new Request("GET", "https://www.domain.com/categories/1735750");
    }

    /**
     * Handles a listing response.
     *
     * When the page carries a "#max_page" element, the pagination state is
     * (re)initialised from it. Requests for the remaining listing pages are
     * yielded first, followed by one request per product link.
     *
     * @param Haixun\Http\Response $response Response being parsed.
     * @param mixed                $index    Not used by this callback.
     */
    public function parse(Haixun\Http\Response $response, $index)
    {
        // Query the pagination node once instead of twice.
        $maxPageNode = $response->css("#max_page");
        if (count($maxPageNode) > 0) {
            $this->maxPage = (int)$maxPageNode->text();
            $this->currentPage = 1;

            // Only take the user id when the pattern actually matched; indexing
            // an empty match array would raise an "undefined offset" warning.
            if (preg_match("%(user_[^']+)%", $response->getBodyContents(), $match) === 1) {
                $this->userId = $match[0];
            }
        }

        $uri = new \GuzzleHttp\Psr7\Uri($response->getCurrentUrl());

        // Without a user id the load_items URL would be malformed, so skip pagination.
        if ($this->userId !== null) {
            // Advance the counter inside the loop so the requested pages are
            // 2..maxPage. The previous post-increment in the loop condition
            // requested maxPage + 1 as well.
            while ($this->currentPage < $this->maxPage) {
                $this->currentPage++;
                yield new Request("GET", sprintf("https://%s/load_items/categories/1735750/%s/%s/0", $uri->getHost(), $this->currentPage, $this->userId));
            }
        }

        // NOTE(review): the 'meta' => 'callback' entry presumably tells the
        // framework to route the response to parseProduct() - confirm.
        foreach ($response->css(".item a[href*=items]")->links() as $link) {
            yield new Request("GET", $link->getUri(), ['meta' => ['callback' => 'parseProduct']]);
        }
    }

    /**
     * Handles a product page: dumps the text of its "h2.itemTitle" element.
     *
     * @param Haixun\Http\Response $response Response being parsed.
     * @param mixed                $index    Not used by this callback.
     */
    public function parseProduct(Haixun\Http\Response $response, $index)
    {
        var_dump($response->css("h2.itemTitle")->text());
    }

    /**
     * Intentionally empty in this example.
     */
    public function finish()
    {
    }
}

启动爬虫

// Build the spider, hand it to a crawler, and start crawling.
$spider = new TheBaseSpider();
$crawler = new \Haixun\Core\Crawler($spider);
$crawler->crawl();

DomCrawler Crawler

实例化

use Symfony\Component\DomCrawler\Crawler;

$url = 'https://movie.douban.com/subject/25812712/?from=showing';

// file_get_contents() returns false on failure; stop here rather than
// handing a non-string to the crawler.
$response = file_get_contents($url);
if ($response === false) {
    throw new \RuntimeException("Failed to fetch {$url}");
}
// Extract page data with XPath queries
$data    = []; // structured data is stored in this array


$crawler = new Crawler();
$crawler->addHtmlContent($response);

查找元素

// XPath: select the node set once, then call text() and html() on it.
$byXPath = $crawler->filterXPath('//*[@id="content"]/h1/span[1]');
$byXPath->text();
$byXPath->html();

// CSS selector: same two calls on the CSS-selected node set.
$byCss = $crawler->filter('#content h1 span');
$byCss->text();
$byCss->html();

遍历元素

// Run the callback on every matched <li>; $i is the position passed by each().
$crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) {
    // Read an attribute. The terminating semicolon was missing, which is a parse error.
    $node->attr("class");
});

获取总数

// Select the matching links, then ask the selection how many nodes it holds.
$itemLinks = $crawler->filter(".item a[href*=items]");
$itemLinks->count();

遍历所有链接

// Collect the link objects first, then print each link's URI.
$links = $crawler->filter(".item a[href*=items]")->links();
foreach ($links as $link) {
    echo $link->getUri();
}

haixun's People

Contributors

wengoooooooo avatar ernst-lee avatar wwwenge avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web. A new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games, to make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.