百度贴吧爬虫程序

时间:2017-11-12 20:47:20   收藏:0   阅读:290

#coding:utf-8

import requests

import random



class TiebaSpider:
    """Crawler that downloads the listing pages of a Baidu Tieba forum
    and saves each page's HTML to a local file named '<page>.html'.

    Fixes vs. the original:
    - replaced typographic quotes (‘...‘), which were a syntax error;
    - added the missing ``self`` parameter on ``save_html_str`` (calling it
      as an instance method raised TypeError: 3 args to a 2-param function);
    - ``run`` uses ``enumerate`` instead of an O(n) ``list.index`` per page;
    - files are written with an explicit UTF-8 encoding.
    """

    def __init__(self, tieba_name):
        # Browser-like User-Agent so the server serves the normal page.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
        self.tieba_name = tieba_name
        # pn= is the 0-based thread offset; each listing page shows 50 threads.
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"

    def get_url_list(self):
        """Return the URLs of the first 30 listing pages (offsets 0..1450)."""
        return [self.url_temp.format(i * 50) for i in range(30)]

    def parse_url(self, url):
        """GET *url* with the crawler's headers and return the decoded body."""
        print('正在请求%s' % url)
        res = requests.get(url, headers=self.headers)
        # NOTE(review): .decode() assumes the server sends UTF-8 (or supplies
        # a BOM); tieba.baidu.com pages are UTF-8 per the &ie=utf-8 query.
        return res.content.decode()

    def save_html_str(self, html_str, page_num):
        """Write *html_str* to '<page_num>.html' in the current directory."""
        print('正在保存第%s页.html' % page_num)
        file_name = str(page_num) + '.html'
        # Explicit UTF-8 so the save does not depend on the platform's
        # default encoding (the original crashed on e.g. GBK consoles).
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(html_str)
            print('保存%s成功' % file_name)

    def run(self):
        """Main flow: build the URL list, fetch each page, save it to disk."""
        url_list = self.get_url_list()
        # enumerate gives the 1-based page number directly, replacing the
        # original's O(n) url_list.index(url) lookup on every iteration.
        for page_num, url in enumerate(url_list, start=1):
            html_str = self.parse_url(url)
            self.save_html_str(html_str, page_num)


if __name__ == "__main__":
    # Script entry point: prompt for a forum name, then crawl and save
    # its first 30 listing pages.
    # Fixed: the original used typographic quotes (‘...‘), a syntax error.
    tieba_name = input('请输入要贴吧名:')
    tieba = TiebaSpider(tieba_name)
    tieba.run()


本文出自 “梦女孩” 博客,请务必保留此出处http://dreamgirl1314.blog.51cto.com/1159474/1981063

原文:http://dreamgirl1314.blog.51cto.com/1159474/1981063

评论(0)
© 2014 bubuko.com 版权所有 - 联系我们:wmxa8@hotmail.com
打开技术之扣,分享程序人生!