import scrapy
# Scrapy's CookieJar wraps the stdlib http.cookiejar implementation,
# so it behaves the same way as the built-in one.
from scrapy.http.cookies import CookieJar

# A single shared jar that collects cookies extracted from responses.
cookie_jar = CookieJar()


class MySpider(scrapy.Spider):
    # Plain name -> value dict, filled by save_cookie() and passed to
    # later requests via the ``cookies=`` argument.
    # (In the original snippet this was a module-level variable but was
    # accessed as ``self.real_cookie`` — an AttributeError; it belongs
    # on the class.)
    real_cookie = {}

    # ... other spider attributes and methods elided ...

    def login(self, response):
        """Submit the login form; check_login() verifies the result."""
        return [scrapy.FormRequest(
            url=login_url,          # placeholder: the site's login URL
            formdata={'username': xxx, 'password': xxx},  # placeholders
            callback=self.check_login,
        )]

    def save_cookie(self, cookie_jar):
        """Copy every name/value pair from *cookie_jar* into self.real_cookie.

        The original parsed ``str(cookie)`` (``<Cookie name=value for ...>``)
        with a regex compiled inside the loop; Cookie objects already expose
        ``.name`` and ``.value``, which is simpler and does not depend on the
        repr format (nor break on values containing spaces or ``for``).
        """
        for cookie in cookie_jar:
            self.real_cookie[cookie.name] = cookie.value

    def check_login(self, response):
        if 登录成功:  # pseudo-code placeholder: "login succeeded"
            # At this point the session cookie is in the response headers
            # ('Set-Cookie'); extract_cookies() pulls it into the jar.
            # (The original referenced the undefined name ``cookiejar``;
            # the module-level jar is ``cookie_jar``.)
            cookie_jar.extract_cookies(response, response.request)
            self.save_cookie(cookie_jar)
之后的请求就可以通过 `cookies=` 参数带上提取到的 cookie：
# Example: attach the collected cookies to a later request made from inside
# the spider (``self.xxx`` is a placeholder callback, ``url`` a placeholder
# target).
scrapy.Request(url, callback=self.xxx, cookies=self.real_cookie)