Here is my scenario: I have a number of URLs and want to open them in bulk, in parallel, but

<a class="user-mention notranslate" data-hovercard-type="user" data-hovercard-url="/us

about CONCURRENCY_PAGE about puppeteer-cluster HOT 8 CLOSED

thomasdondorf commented on May 18, 2024

about CONCURRENCY_PAGE

from puppeteer-cluster.

Comments (8)

thomasdondorf commented on May 18, 2024

I guess you are calling cluster.queue with invalid URLs?

If you want to use skipDuplicateUrls or sameDomainDelay, you need to either provide the URL as a string or put the URL into a url property. That means you have to call the queue function like this:

cluster.queue('http://yoururl.tld/path');

or like this:

cluster.queue({ url: 'http://yoururl.tld/path' });

from puppeteer-cluster.

kanxue660 commented on May 18, 2024

my code

// Load the batch of rows to process; fetch60 is defined elsewhere in the poster's script.
db_result = await fetch60();//fetch 60 urls from db
// Queue each row's link_url on the cluster, awaiting every queue call in turn.
for (let db_row of db_result) {//loop
await cluster.queue(db_row['link_url']);
};

from puppeteer-cluster.

thomasdondorf commented on May 18, 2024

Is the actual URL given to cluster.queue with a protocol (http(s)://test.tld/...), or are you only passing part of the URL (test.tld/...)? The latter will not work; the former should work.

from puppeteer-cluster.

kanxue660 commented on May 18, 2024

With the current settings, what actually happens is that 10 URLs are opened at the same time. What I want is this: because these 10 URLs are under the same domain name, each one should be delayed. Ten tabs may be open at the same time, but the URLs must not be loaded at the same time; a delay has to be added between them, otherwise the target site judges the visitor to be a robot. I tried adding delay code at the top of the task code, but the pages are still opened at the same time.

from puppeteer-cluster.

thomasdondorf commented on May 18, 2024

I understand your scenario and the library supports it.

Please either answer my questions or provide your source code.

from puppeteer-cluster.

kanxue660 commented on May 18, 2024

const { Cluster } = require('puppeteer-cluster');

// Root directory of the shared rule modules, taken from the environment.
const module_path = process.env.my_nodemodules;
// Logging. The path is a template literal: the backticks are required for
// ${module_path} to be interpolated (without them this line is a syntax error).
const logger = require(`${module_path}/newLogger.js`);
// Shared helper functions.
const { delayAsync } = require(`${module_path}/my_common_func.js`);

// ---- Database setup ----
const table_models = require('./table_model');
const db_models = require(`${module_path}/mongodb_model`);

const mongoose = require('mongoose');
const DB_URL = 'mongodb://localhost:27017/weburl';
const db = mongoose.createConnection(DB_URL, { useNewUrlParser: true });
// Each document holds one URL plus a flag telling whether it was already processed.
const scm_list = new mongoose.Schema({ link_url: String, is_deal: Boolean });
const list_model = db.model('t_list', scm_list);

// Log connection lifecycle events.
db.on('connected', function () { console.log('Mongodb 链接成功 ' + DB_URL); });
db.on('error', function (err) { console.log('Mongodb 链接失败: ' + err); });
db.on('disconnected', function () { console.log('Mongodb 链接断开'); });

// Mutable state: the rows returned by the most recent fetch60() call.
let db_result = [];

// Entry point. The promise is handled explicitly so that a failure is reported
// and reflected in the exit code instead of becoming an unhandled rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

//-------------------函数定义---------------------
/**
 * Fetches the next batch of URLs that are waiting to be collected.
 *
 * Queries `list_model` for documents whose `is_deal` flag is false, limited to
 * 60 rows, selecting only the `link_url` field. (The original note says the
 * script is meant to run once every 10 minutes, taking 60 rows each time.)
 *
 * @returns {Promise<Array<{link_url: string}>>} The matching documents.
 * @throws {Error} When the query fails; the underlying error is attached as `cause`.
 */
async function fetch60() {
  try {
    // exec() without a callback returns a promise, so no manual Promise wrapper is needed.
    return await list_model
      .find({})
      .where('is_deal').equals(false)
      .limit(60)
      .select('link_url')
      .exec();
  } catch (err) {
    // Throw a real Error (not a bare string) so callers get a stack trace.
    throw new Error(`查找待采集数据失败：${err}`, { cause: err });
  }
}

/**
 * Launches a puppeteer-cluster, queues every URL returned by fetch60(), waits
 * until all queued jobs have finished, then shuts everything down.
 *
 * The cluster and the database connection are closed in `finally` blocks so
 * they are released even when launching, fetching or queueing throws.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let cluster;
  try {
    cluster = await Cluster.launch({
      concurrency: Cluster.CONCURRENCY_PAGE,
      maxConcurrency: 5,
      retryLimit: 5, // retry a failed job up to 5 times
      retryDelay: 2000, // wait 2 seconds between retries
      // Delay between requests to the same domain: 30 seconds. (The original
      // note said 10 seconds and reported that the option seemed to have no effect.)
      sameDomainDelay: 30 * 1000,
      skipDuplicateUrls: true, // skip URLs that were already queued
      workerCreationDelay: 500, // delay between opening tabs
      puppeteerOptions: {
        headless: false,
        ignoreHTTPSErrors: true,
        slowMo: 250, // slow each puppeteer operation down
        defaultViewport: { width: 1440, height: 900 },
      },
    });

    cluster.on('taskerror', (err, data) => {
      console.log(`采集任务异常 ${data}: ${err.message}`);
    });

    // Register the task function before queueing any job, so a job can never
    // be picked up while no task function is defined.
    await cluster.task(getHtmlSorce);

    db_result = await fetch60();
    for (const db_row of db_result) {
      await cluster.queue(db_row['link_url']); // e.g. https://www.lagou.com/jobs/xxxxx.html
    }

    await cluster.idle();
  } finally {
    try {
      // cluster is still undefined when Cluster.launch itself failed.
      if (cluster) {
        await cluster.close();
      }
    } finally {
      await db.close();
    }
  }
}

/**
 * Cluster task: navigates the supplied page to the job's URL and prints the
 * page's HTML once the DOM content has loaded.
 *
 * @param {{page: object, data: string}} job - The page to drive and the URL to open.
 * @returns {Promise<void>}
 */
async function getHtmlSorce({ page, data: targetUrl }) {
  const navigationOptions = { waitUntil: 'domcontentloaded' };
  await page.goto(targetUrl, navigationOptions);

  // Placeholder for persisting the result: for now the HTML is only printed.
  const html = await page.content();
  console.log(html);
}

from puppeteer-cluster.

JohnDotOwl commented on May 18, 2024

@kanxue660 If you solved it, perhaps share the solution so others could learn? Haha

from puppeteer-cluster.

kanxue660 commented on May 18, 2024

@Rainbowhat The author has fixed this problem.

from puppeteer-cluster.

about CONCURRENCY_PAGE about puppeteer-cluster HOT 8 CLOSED

Comments (8)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent