Data Parsing for Web Crawlers


Two ways to download images

import requests

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

img_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1559019106959&di=3aa954df95d2e55083d85de8391118c5&imgtype=0&src=http%3A%2F%2Fimg3.duitang.com%2Fuploads%2Fitem%2F201601%2F28%2F20160128195606_xvawC.jpeg'

# Method 1: requests -- .content returns the response body as raw bytes,
# which we then write to a file ourselves
img_data = requests.get(url=img_url,headers=headers).content

with open('./meinv.jpg','wb') as fp:
    fp.write(img_data)
# Method 2: urllib -- urlretrieve downloads the URL and saves it in one call
from urllib import request

img_url = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1559019106959&di=3aa954df95d2e55083d85de8391118c5&imgtype=0&src=http%3A%2F%2Fimg3.duitang.com%2Fuploads%2Fitem%2F201601%2F28%2F20160128195606_xvawC.jpeg'
request.urlretrieve(img_url,'./meishaonv.jpg')
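
Both snippets save the same image. The difference is that requests hands back the raw bytes (.content) and you write the file yourself, while urlretrieve downloads and saves in a single call but does not accept custom request headers directly.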

Data parsing

There are several commonly used ways to parse data in Python; this post covers regex, bs4 (BeautifulSoup), and xpath.

The principle is the same for all of them: first locate the target tags in the page source, then extract the text or attribute values stored inside them.

1. Regex parsing:

Analyzing the page:

As the snippet below shows, we need to extract the src attribute of the img tag:

<div class="thumb">

<a href="/article/121858876" target="_blank">
<img src="//pic.qiushibaike.com/system/pictures/12185/121858876/medium/VFMNF2Z1GNEYXR3A.jpg" alt="求大神制作">
</a>

</div>

Implementation:

import os
import re

import requests
from urllib import request

# directory for the downloaded images
if not os.path.exists('./qiutu'):
    os.mkdir('./qiutu')

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}

url = 'https://www.qiushibaike.com/pic/'
page_text = requests.get(url=url,headers=headers).text

# non-greedy groups keep each match inside a single <div class="thumb"> block;
# re.S lets . match newlines so the pattern can span multiple lines
ex = '<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
img_urls = re.findall(ex,page_text,re.S)
for url in img_urls:
    url = 'https:' + url            # the src values are protocol-relative
    img_name = url.split('/')[-1]
    img_path = './qiutu/' + img_name
    request.urlretrieve(url,img_path)
    print(img_name,'downloaded!')

2. bs4 parsing (beautifulsoup4)


Basic usage of the bs4 module

The examples use a test.html file prepared in the current directory:

from bs4 import BeautifulSoup

# build a BeautifulSoup object from the local file, parsed with lxml
fp = open('./test.html','r',encoding='utf-8')
soup = BeautifulSoup(fp,'lxml')

1. soup.tagName: locates only the first occurrence of the tagName tag

soup.title
soup.div


2. soup.find('tagName')

# soup.find('tagName') works like soup.tagName
soup.find('a')  # equivalent to soup.a
# locating by attribute
soup.find('div',class_='song')

The result:

<div class="song">
<p>李清照</p>
<p>王安石</p>
<p>苏轼</p>
<p>柳宗元</p>
<a href="http://www.song.com/" target="_self" title="赵匡胤">
<span>this is span</span>
        宋朝是最强大的王朝,不是军队的强大,而是经济很强大,国民都很有钱</a>
<a class="du" href="">总为浮云能蔽日,长安不见使人愁</a>
<img alt="" src="http://www.baidu.com/meinv.jpg"/>
</div>

3. find_all

soup.find_all('div')[2]  # the third of all the div tags on the page
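
find_all takes the same tag and attribute filters as find but returns every match as a list. A quick sketch against the div.song structure shown in the output above:

# every <p> inside the div whose class is "song"
for p in soup.find('div',class_='song').find_all('p'):
    print(p.string)
# every <a> tag whose class is "du"
soup.find_all('a',class_='du')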

4. select('CSS selector')

soup.select('.song')  # all tags with class "song"
soup.select('div')    # all div tags
soup.select('.tang > ul > li > a')  # > matches direct children only
soup.select('.tang a')              # a space matches descendants at any depth
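
Once a tag has been located, its text and attribute values can be read off directly; the Three Kingdoms example below relies on exactly this. A minimal sketch, assuming the same test.html prepared above:

a_tag = soup.select('.tang > ul > li > a')[0]   # select always returns a list
a_tag.string    # the tag's direct text (None if it contains nested tags)
a_tag.text      # all text, including text inside nested tags
a_tag['href']   # the value of the href attribute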

Example: scraping the novel Romance of the Three Kingdoms from www.shicimingju.com

# requests, BeautifulSoup and headers are reused from the snippets above
url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
page_text = requests.get(url=url,headers=headers).text

# parse the chapter list: title and detail-page url
soup = BeautifulSoup(page_text,'lxml')
li_list = soup.select('.book-mulu > ul > li')
fp = open('./sanguo.txt','w',encoding='utf-8')
for li in li_list:
    title = li.a.string
    detail_url = 'http://www.shicimingju.com'+li.a['href']
    # request each detail page separately to get the chapter body
    detail_page_text = requests.get(url=detail_url,headers=headers).text
    soup = BeautifulSoup(detail_page_text,'lxml')
    content = soup.find('div',class_="chapter_content").text

    fp.write(title+'\n'+content+'\n')
    print(title,': downloaded!')

fp.close()

3. xpath parsing

from lxml import etree

# etree.parse builds an element tree from a local file
tree = etree.parse('./test.html')

Locating the title tag

tree.xpath('/html/head/title')
tree.xpath('/html//title')
tree.xpath('//title')

Locating the div with class="song"

tree.xpath('//div[@class="song"]')
tree.xpath('//div[2]')  # indexing in xpath expressions starts at 1

Extracting text

# extracting text (here, 李清照): /text() takes direct text, //text() takes all nested text
tree.xpath('//div[@class="song"]/p[1]/text()')[0]
tree.xpath('//div[@class="song"]/text()')
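
Attribute values are pulled out the same way, with @attrName at the end of the expression. A small sketch, assuming test.html is the same file shown in the bs4 section (the img inside div.song):

# attribute extraction: /@attrName
tree.xpath('//div[@class="song"]/img/@src')  # ['http://www.baidu.com/meinv.jpg']
tree.xpath('//a[@class="du"]/@href')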

Scraping Python crawler job listings from Boss Zhipin

# Boss Zhipin; requests, etree and headers are reused from the snippets above
url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
page_text = requests.get(url=url,headers=headers).text

# parse out job title, salary and company from each listing
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@class="job-list"]/ul/li')
for li in li_list:
    title = li.xpath('.//div[@class="job-title"]/text()')[0]
    salary = li.xpath('.//span[@class="red"]/text()')[0]
    company = li.xpath('.//div[@class="company-text"]/h3/a/text()')[0]
    print(title,salary,company)

Original post: https://www.cnblogs.com/robertx/p/10940861.html
