Crawling an entire site's data with scrapy CrawlSpider

The spider below uses CrawlSpider with a LinkExtractor-based Rule to follow every article link under this author's blog on blog.csdn.net and extract each article's title:

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors.sgml import SgmlLinkExtractor  # legacy import path
from scrapy.linkextractors import LinkExtractor

from CrawlSpiderTest.items import CrawlspidertestItem


class CsdnarticleSpider(CrawlSpider):
    name = 'csdnArticle'
    allowed_domains = ['blog.csdn.net']
    start_urls = ['https://blog.csdn.net/u012150179/article/details/11749017']

    # Follow any link whose URL matches this author's article pages.
    pagelink = LinkExtractor(allow=('/u012150179/article/details',))

    rules = [
        Rule(pagelink, callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        # Every matched article page lands here; pull out its title.
        item = CrawlspidertestItem()
        item['title'] = response.css('.title-article::text').extract_first()
        yield item

    # CrawlSpider drives the crawl through its rules, so the default
    # parse() callback must not be overridden; it stays commented out.
    # def parse(self, response):
    #     pass
```
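The item class imported from `CrawlSpiderTest.items` is not shown in the post. A minimal sketch of what it presumably looks like, assuming the project was created with `scrapy startproject CrawlSpiderTest` and that only the `title` field used in `parse_item` is needed:

```python
# CrawlSpiderTest/items.py -- hypothetical sketch; only the `title` field
# referenced by parse_item above is defined.
import scrapy


class CrawlspidertestItem(scrapy.Item):
    title = scrapy.Field()
```

With that in place, the crawl can be run from the project root with `scrapy crawl csdnArticle -o titles.json`, which writes the collected titles out as JSON.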
