Hi @Insutanto
Great! Your work is really impressive. However, I would like to add some suggestions.
First of all, I wanted to open the RabbitMQ management console (http://127.0.0.1:15672), but it didn't work.
I fixed it by entering the RabbitMQ container, enabling the rabbitmq_management web plugin, and then refreshing the management console. The commands are:
docker exec -it <rabbitmq-container-id> /bin/bash
cd /etc/rabbitmq/
rabbitmq-plugins enable rabbitmq_management
Secondly, to delete a key from Redis, the commands are:
docker exec -it <redis-container-id> /bin/bash
redis-cli
keys *
del <key_name>
Thirdly, I added image and file download support to the common RabbitMQ example. The code is:
PATH: examples/rabbitmq_example/simple_example/settings.py
# Pipeline priorities: lower numbers run first.
ITEM_PIPELINES = {
'simple_example.pipelines.SimpleExamplePipeline': 201,
'scrapy_distributed.pipelines.amqp.RabbitPipeline': 200,
# NOTE(review): RabbitPipeline (200) runs before the media pipelines below
# (202/203), so items are published to RabbitMQ before the 'images'/'files'
# result fields are populated — confirm this ordering is intended.
'simple_example.pipelines.ImagePipeline': 202,
'simple_example.pipelines.MyFilesPipeline': 203,
}
# Local directories where FilesPipeline/ImagesPipeline store downloads,
# relative to the directory Scrapy is run from.
FILES_STORE = './test_data/example_common/files_dir'
IMAGES_STORE = './test_data/example_common/images_dir'
PATH: examples/rabbitmq_example/simple_example/pipelines.py
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
class ImagePipeline(ImagesPipeline):
    """Download the images listed in item['image_urls'] and store each one
    under a per-page directory derived from the item's url and title."""

    def get_media_requests(self, item, info):
        # enumerate() instead of item['image_urls'].index(image_url): index()
        # is an O(n) scan per URL and always returns the FIRST occurrence,
        # which yields wrong indices when a URL appears more than once.
        for index, image_url in enumerate(item['image_urls']):
            yield Request(image_url, meta={'item': item, 'index': index})

    def item_completed(self, results, item, info):
        # Keep only successfully downloaded images; drop the item if none
        # of its image downloads succeeded.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None, item=None):
        # 'item=None' keeps this override compatible with Scrapy >= 2.4,
        # which passes the item as a keyword argument; older versions that
        # omit it still work unchanged.
        item = request.meta['item']
        image_guid = request.url.split('/')[-1]
        # NOTE(review): assumes item['title'] is non-None and contains no
        # path separators — a '/' in the title would add a directory level.
        filename = './{}/{}/{}'.format(item["url"].replace("/", "_"), item['title'], image_guid)
        return filename
class MyFilesPipeline(FilesPipeline):
    """Download the documents listed in item['file_urls'] and store each one
    under a per-page directory derived from the item's url and title."""

    def get_media_requests(self, item, info):
        # enumerate() instead of item['file_urls'].index(file_url): index()
        # is an O(n) scan per URL and always returns the FIRST occurrence,
        # which yields wrong indices when a URL appears more than once.
        for index, file_url in enumerate(item['file_urls']):
            yield Request(file_url, meta={'item': item, 'index': index})

    def item_completed(self, results, item, info):
        # Keep only successfully downloaded files; drop the item if none
        # of its file downloads succeeded.
        file_paths = [x['path'] for ok, x in results if ok]
        if not file_paths:
            raise DropItem("Item contains no files")
        return item

    def file_path(self, request, response=None, info=None, item=None):
        # 'item=None' keeps this override compatible with Scrapy >= 2.4,
        # which passes the item as a keyword argument; older versions that
        # omit it still work unchanged.
        item = request.meta['item']
        file_guid = request.url.split('/')[-1]
        # NOTE(review): assumes item['title'] is non-None and contains no
        # path separators — a '/' in the title would add a directory level.
        filename = './{}/{}/{}'.format(item["url"].replace("/", "_"), item['title'], file_guid)
        return filename
PATH: examples/rabbitmq_example/simple_example/items.py
class CommonExampleItem(scrapy.Item):
    # Item for the common crawl example: page metadata plus the media fields
    # consumed/filled by the Images/Files pipelines.
    # define the fields for your item here like:
    title = scrapy.Field()       # page <title> text
    url = scrapy.Field()         # response URL the item was scraped from
    content = scrapy.Field()     # full response body text
    image_urls = scrapy.Field()  # input to ImagesPipeline
    images = scrapy.Field()      # download results written by ImagesPipeline
    file_urls = scrapy.Field()   # input to FilesPipeline
    files = scrapy.Field()       # download results written by FilesPipeline
PATH: examples/rabbitmq_example/simple_example/spiders/example.py
def parse(self, response):
    """Follow every link on the page and yield one CommonExampleItem holding
    the page text plus absolute image/file URLs for the media pipelines."""
    self.logger.info(f"parse response, url: {response.url}")
    for link in response.xpath("//a/@href").extract():
        # response.urljoin handles absolute, root-relative and page-relative
        # hrefs; naive `response.url + link` concatenation produced broken
        # URLs for any relative link.
        yield Request(url=response.urljoin(link))
    item = CommonExampleItem()
    item['url'] = response.url
    item['title'] = response.xpath("//title/text()").extract_first()
    item["content"] = response.text
    # Collect jpg/png image URLs, resolved to absolute form. urljoin also
    # replaces the old `re.match("(.*?//.*?)/", response.url)` host hack,
    # which raised AttributeError for URLs without a path slash and broke
    # page-relative src values.
    item['image_urls'] = [
        response.urljoin(image_url)
        for image_url in response.xpath('//a/img/@src').extract()
        if image_url.endswith(('jpg', 'png'))
    ]
    # Collect downloadable document links; the raw string avoids invalid
    # escape sequences (\.docx etc.) in the XPath regex literal.
    file_links = response.xpath(
        r"//a[re:match(@href,'.*(\.docx|\.doc|\.xlsx|\.pdf|\.xls|\.zip)$')]/@href"
    ).extract()
    item['file_urls'] = [response.urljoin(file_url) for file_url in file_links]
    yield item
Finally, I hope the author can add a tutorial on crawling dynamic web pages. Thanks again!