主要是对上一篇文章的简单仿写,大家以后想批量下载什么图片照格式仿写就好。由于本人是tfboys的粉丝,所以平常没事爱逛贴吧欣赏我家三小只的美图,所以这次就以贴吧[小王的讨论楼]为例,批量爬取该楼的图片[1]
items.py 编写
import scrapy


class WangyuantuItem(scrapy.Item):
    """Container for one batch of image links scraped from the thread."""

    # List of absolute image URLs collected by the spider; the pipeline
    # downloads each one. This is the only field the project needs.
    image_urls = scrapy.Field()
spider的编写
import scrapy

from wangyuantu.items import WangyuantuItem


class XiaowangSpider(scrapy.Spider):
    """Collect image URLs from pages 21-44 of Tieba thread 3888309273."""

    name = "xiaowang"
    # BUG FIX: allowed_domains must hold bare domain names only. The
    # original value included the thread path, which makes Scrapy's
    # offsite middleware filter every request as off-domain.
    allowed_domains = ["tieba.baidu.com"]
    # One start URL per page of the thread (pn = page number).
    start_urls = [
        'http://tieba.baidu.com/p/3888309273?pn=%d' % i for i in range(21, 45)
    ]

    def parse(self, response):
        """Yield one item carrying the src of every post image on the page.

        Images posted in a Tieba thread carry the CSS class ``BDE_Image``;
        avatars and UI graphics do not, so this XPath selects post images only.
        """
        item = WangyuantuItem()
        item['image_urls'] = response.xpath(
            "//img[@class='BDE_Image']/@src").extract()
        yield item
pipelines编写:这个部分都是可以套用的
import os

import requests

from wangyuantu import settings


class ImageDownloadPipeline(object):
    """Download every URL in item['image_urls'] into IMAGES_STORE/<spider.name>."""

    def process_item(self, item, spider):
        """Fetch each image URL and write it to disk, skipping existing files.

        Files are named by joining the URL path segments with underscores,
        e.g. http://host/a/b/c.jpg -> a_b_c.jpg.
        """
        if 'image_urls' in item:
            images = []  # local paths written (or already present) for this item
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                # Drop scheme + host ('http:', '', 'host'), keep the path parts.
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue  # already downloaded earlier; do not re-fetch
                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        # BUG FIX: the original loop consumed the chunks but
                        # never wrote them, so every file came out empty.
                        handle.write(block)
        # BUG FIX: a pipeline must return the item, otherwise any pipeline
        # registered after this one never sees it.
        return item
settings编写
# Scrapy project settings for the wangyuantu image crawler.
BOT_NAME = 'wangyuantu'

SPIDER_MODULES = ['wangyuantu.spiders']
NEWSPIDER_MODULE = 'wangyuantu.spiders'

# Run the custom download pipeline for every yielded item.
ITEM_PIPELINES = {'wangyuantu.pipelines.ImageDownloadPipeline': 1}

# Root directory for downloaded images.
# BUG FIX: the original plain string 'C:\Users\...' is a SyntaxError in
# Python 3 because \U begins a \UXXXXXXXX unicode escape; a raw string
# keeps the backslashes literal.
IMAGES_STORE = r'C:\Users\Lenovo\Pictures'
结果
寄语:wili源源小可爱,希望你快快乐乐的长大