淘宝产品抓取实战

时间:2019-03-13 16:53:30   收藏:0   阅读:285
#!coding=utf-8
import json
import os
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # Suppress urllib3's InsecureRequestWarning (SSL warning) output.
 
class tb(object):
    """Scraper for Taobao's mobile search endpoint (s.m.taobao.com).

    Fetches search-result pages for one keyword and stores selected fields
    of every returned item in ``out.csv`` inside the configured directory.

    Attributes:
        path: Directory in which ``out.csv`` is written.
        seach: Search keyword sent as the ``q`` query parameter.
        s: ``requests`` session carrying the mobile-browser headers.
    """

    # Endpoint queried for every result page.
    _SEARCH_URL = "https://s.m.taobao.com/search"

    # (CSV column name, key inside one result item), in output column order.
    _FIELDS = (
        ("title_名称", "title"),
        ("sold_月销量", "sold"),
        ("commentCount_评论量", "commentCount"),
        ("item_id_商品ID", "item_id"),
        ("userId_商家ID", "userId"),
        ("nick_商家名称", "nick"),
        ("location_商家地址", "location"),
        ("pic_path_图片", "pic_path"),
        ("itemNumId_商品NID", "itemNumId"),
        ("originalPrice_原价", "originalPrice"),
        ("price_售价", "price"),
        ("category_类别ID", "category"),
        ("itemurl_商品链接", "url"),
    )

    # Item keys that may be absent; a missing value is stored as "".
    _OPTIONAL_KEYS = ("commentCount", "category")

    def __init__(self, path, seach):
        """Create the HTTP session used for all search requests.

        Args:
            path: Directory where the output CSV is saved.
            seach: Search keyword.
        """
        self.path = path
        self.seach = seach
        self.s = requests.session()
        # NOTE(review): "br" invites Brotli-compressed replies; presumably
        # requests only decodes those when a Brotli package is installed --
        # confirm, or drop "br" if responses fail to parse.
        headers = {
            "Host": "s.m.taobao.com",
            "Accept-Encoding": "br, gzip, deflate",
            "Connection": "keep-alive",
            "Accept": "application/json",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/16A366 Safari/605.1.15",
            "Accept-Language": "zh-cn",
            "X-Requested-With": "XMLHttpRequest",
        }
        self.s.headers.update(headers)

    @classmethod
    def _columns_from_items(cls, items):
        """Turn a list of result items into a column-name -> values mapping.

        Args:
            items: Iterable of dicts, one per search result.

        Returns:
            Dict keyed by CSV column name (in ``_FIELDS`` order); each value
            is a list with one entry per item.

        Raises:
            KeyError: If an item lacks a key not listed in ``_OPTIONAL_KEYS``.
        """
        columns = {column: [] for column, _ in cls._FIELDS}
        for item in items:
            for column, key in cls._FIELDS:
                if key in cls._OPTIONAL_KEYS:
                    columns[column].append(item.get(key, ""))
                else:
                    columns[column].append(item[key])
        return columns

    def _search_params(self, page):
        """Return the query parameters for result page number ``page``."""
        # Passed via ``params=`` so the keyword is percent-encoded instead of
        # being spliced into the URL verbatim.
        return {
            "event_submit_do_new_search_auction": "1",
            "_input_charset": "utf-8",
            "topSearch": "1",
            "atype": "b",
            "searchfrom": "1",
            "action": "home:redirect_app_action",
            "from": "1",
            "q": self.seach,
            "sst": "1",
            "n": "20",
            "buying": "buyitnow",
            "m": "api4h5",
            "abtest": "18",
            "wlsort": "18",
            "style": "list",
            "closeModues": "nav,selecthot,onesearch",
            "page": page,
        }

    def seachdata(self):
        """Download result pages 0-99 and save their items to ``out.csv``.

        Stops at the first page whose ``listItem`` is empty or missing. A
        page whose body is not valid JSON prints ``err`` and is skipped.
        The first page that yields items (re)creates the file with a header
        row; each later page is appended exactly once.

        Raises:
            KeyError: If an item lacks a required field.
        """
        out_file = os.path.join(self.path, "out.csv")
        header_written = False
        for page in range(0, 100):
            time.sleep(1.25)  # pause between consecutive requests
            print(page)
            body = self.s.get(
                url=self._SEARCH_URL,
                params=self._search_params(page),
                verify=False,
            ).text
            try:
                payload = json.loads(body)
            except ValueError:
                # Never fall through with a stale or undefined payload.
                print("err")
                continue
            print(payload)

            items = payload.get("listItem") if isinstance(payload, dict) else None
            if not items:
                break

            # Build and write the frame once per page, not once per item,
            # so appended pages do not contain repeated rows.
            frame = pd.DataFrame(self._columns_from_items(items))
            if header_written:
                frame.to_csv(out_file, index=False, header=False, mode="a", encoding="GB18030")
            else:
                frame.to_csv(out_file, index=False, header=True, encoding="GB18030")
                header_written = True
 
 
if __name__ == "__main__":
    # Example run: search for "手机" and save the results under E:\taobao.
    t = tb(r"E:\taobao", "手机")
    t.seachdata()

 

原文:https://www.cnblogs.com/chenxi188/p/10524190.html

评论(0)
© 2014 bubuko.com 版权所有 - 联系我们:wmxa8@hotmail.com
打开技术之扣,分享程序人生!