scrapy使用selenium和中间件

时间:2020-04-05 01:39:09   收藏:0   阅读:93

           代码实现:

spider

import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from wangyiPro.items import WangyiproItem
"""
爬取网易国内和国际新闻标题和内容
"""
class WangyiSpider(scrapy.Spider):
    """Crawl NetEase (163.com) domestic and world news titles and bodies.

    The start pages populate their news lists with JavaScript, so this
    spider owns a Selenium-driven Chrome instance; the downloader
    middleware reaches it through ``spider.browser`` to render those
    pages before parsing.
    """
    name = 'wangyi'
    # allowed_domains = ['www.163.com']
    start_urls = ['https://news.163.com/domestic/', 'https://news.163.com/world/']

    def __init__(self, *args, **kwargs):
        # Forward Scrapy's spider kwargs (e.g. name overrides) to the base class.
        super().__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--window-position=0,0')   # initial Chrome window position
        options.add_argument('--window-size=1080,800')  # initial Chrome window size
        # NOTE(review): hard-coded driver path — adjust for your environment.
        self.browser = webdriver.Chrome(executable_path='C://xx//chromedriver.exe',
                                        chrome_options=options)

    def parse(self, response):
        """Parse a news-list page and request every article's detail page."""
        div_list = response.xpath('//div[@class="ndi_main"]/div')
        for div_item in div_list:
            title = div_item.xpath('./div/div[1]/h3/a/text()').extract_first()
            new_detail_url = div_item.xpath('./div/div[1]/h3/a/@href').extract_first()
            if not new_detail_url:
                # Some list entries (ads, separators) have no link — skip them.
                continue
            item = WangyiproItem()
            item['title'] = title
            # Request the detail page; carry the partially-filled item in meta.
            yield scrapy.Request(url=new_detail_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Extract the article body text and yield the completed item."""
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        item = response.meta['item']
        item['content'] = content.strip()
        yield item

    def closed(self, spider):
        # Called by Scrapy when the spider finishes — release the browser.
        self.browser.quit()

middleware

from scrapy import signals
from time import sleep
from scrapy.http import HtmlResponse
class WangyiproDownloaderMiddleware(object):
    """Downloader middleware that substitutes Selenium-rendered responses.

    The spider's start URLs load their news lists dynamically via
    JavaScript, so the raw downloader response for those URLs is replaced
    with the page source produced by the spider's shared Chrome browser.
    All other responses pass through unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Standard Scrapy hook: instantiate and subscribe to spider_opened.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # No request-side tampering; returning None lets Scrapy continue.
        return None

    def process_response(self, request, response, spider):
        """Return a Selenium-rendered response for start URLs.

        Only requests whose URL is in ``spider.start_urls`` are rendered
        through the browser; everything else is returned untouched.
        """
        bro = spider.browser  # browser instance created by the spider
        if request.url in spider.start_urls:
            # Render the page so JS-loaded news entries are present.
            bro.get(request.url)
            sleep(3)  # crude wait for the initial page load
            # Scroll to the bottom to trigger lazy-loaded content.
            bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            sleep(1)
            page_text = bro.page_source  # now includes the dynamic content
            # Replace the original response with one built from the rendered HTML.
            return HtmlResponse(url=request.url, body=page_text,
                                encoding='utf-8', request=request)
        # Non-start-URL requests: hand back the downloader's response as-is.
        return response

    def process_exception(self, request, exception, spider):
        # Download errors are not handled here; fall through to other middlewares.
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

pipeline文件

import pymysql

class WangyiproPipeline(object):
    """Persist crawled news items into a MySQL table via PyMySQL.

    The connection is opened once in ``open_spider`` and closed once in
    ``close_spider``; ``process_item`` runs once per scraped item.
    """

    def __init__(self):
        self.conn = None    # DB connection, opened in open_spider
        self.cursor = None  # current cursor, created per item
        self.num = 0

    def open_spider(self, spider):
        # Runs once when the spider starts: open the database connection.
        # NOTE(review): host/user/password/db are placeholders — supply real
        # credentials (ideally via Scrapy settings, not hard-coded).
        self.conn = pymysql.Connect(host='192.168.xx.xx', port=3306,
                                    user='root', password='xx', db='xx_db',
                                    charset='utf8')
        print('爬虫数据库开始')

    def process_item(self, item, spider):
        """Insert one item into the ``qiubai`` table (parameterized SQL)."""
        author = item['title']
        content = item['content']
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute('insert into qiubai values(%s,%s)',
                                (author, content))
            self.conn.commit()
        except Exception as e:
            # Fix: original used content[0,20] (tuple index into a str,
            # which raises TypeError); slice a short preview instead.
            print(e, content[:20])
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Runs once when the spider closes: release DB resources.
        # Guard: cursor is still None if no item was ever processed.
        print('爬虫数据库结束')
        if self.cursor is not None:
            self.cursor.close()
        self.conn.close()

 

items文件

 

class WangyiproItem(scrapy.Item):
    """Container for one news article scraped from 163.com."""
    # Removed the redundant trailing `pass` — the class body is non-empty.
    title = scrapy.Field()    # article headline from the list page
    content = scrapy.Field()  # full article body text from the detail page

 

setting配置

# Spoof the request User-Agent so requests look like a normal browser.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False  # deliberately ignore robots.txt for this crawl

# Only show log messages of this level or above.
LOG_LEVEL = 'ERROR'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'wangyiPro.middlewares.WangyiproSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}

 

原文:https://www.cnblogs.com/xiao-apple36/p/12635470.html

评论(0
© 2014 bubuko.com 版权所有 - 联系我们:wmxa8@hotmail.com
打开技术之扣,分享程序人生!