1. 安装chrome,chromedriver
chrome浏览器要升级到最新版本。
用brew安装chromedriver
用pip安装 Selenium
brew tap caskroom/cask
brew cask install chromedriver
2. 写middleware
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
from .useragent import agents
import random
import base64
from tutorial import settings
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from scrapy.http import HtmlResponse
from logging import getLogger
from selenium.webdriver.chrome.options import Options


class SeleniumMiddleware():
    """Downloader middleware that renders pages with headless Chrome.

    Each request is fetched by a real Chrome instance, so the
    ``HtmlResponse`` handed to the spider's parse callbacks contains the
    JavaScript-rendered DOM instead of the raw server HTML.
    """

    def __init__(self, timeout=None, service_args=None):
        # ``service_args`` is accepted for backward compatibility but is
        # currently unused (was a mutable-default ``[]`` — fixed to None).
        self.logger = getLogger(__name__)
        self.timeout = timeout
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        if self.timeout:
            # Previously ``timeout`` was stored but never applied, so the
            # TimeoutException branch below could never trigger.
            self.driver.set_page_load_timeout(self.timeout)

    def __del__(self):
        # quit() shuts down the browser AND the chromedriver service process;
        # close() only closes the current window and leaks the driver process.
        self.driver.quit()

    def process_request(self, request, spider):
        """Fetch ``request.url`` with Chrome and return the rendered page.

        Returns an ``HtmlResponse`` with the rendered ``page_source`` on
        success, or a status-500 ``HtmlResponse`` when the page load
        exceeds the configured timeout.
        """
        try:
            self.driver.get(request.url)
            return HtmlResponse(url=request.url,
                                body=self.driver.page_source,
                                request=request,
                                encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)
3. settings
# Register the Selenium middleware in Scrapy's downloader-middleware chain
# (543 places it in the default middle priority range).
DOWNLOADER_MIDDLEWARES = {
    'tutorial.middlewares.SeleniumMiddleware': 543,
}
4.其他说明
爬虫里的写法不变,这边主要是写了一个 downloader middleware,返回一个用 chromedriver 请求后的 page_source,这样 parse 里拿到的就是渲染后的内容,可以爬取 JavaScript 渲染后的页面。启动后感觉性能要比 Splash 差一些。
另外本来是用PhantomJs,但是运行后,提示这个不推荐使用了。。。推荐使用firefox和chrome的headless版本。。
3322