
Sharing a small scraper I recently wrote for the xiaozhu.com (小猪短租) short-term rental site


Hi everyone, it feels like ages since my last blog post, so today I'm sharing the fruits of my own labour: a small scraper that collects short-term rental listings. The code is fairly short and tidy, so if you're just learning to write scrapers it may be worth a look; of course, for the veterans there's nothing remarkable here. The code uses a nested conditional (ternary) expression; if you can't follow it, feel free to leave a comment. And if you have a better way to scrape this data, please share it!
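For reference, here is that nested conditional expression pulled out on its own. This is only an illustrative sketch: the class_name variable is a stand-in for sexys.get('class')[0] in the actual script below.

# class_name stands in for the gender span's first CSS class (illustrative only)
class_name = 'member_boy_ico'
gender = '空' if class_name == '' else ('男' if class_name == 'member_boy_ico' else '女')
print(gender)  # prints 男; '空' = unknown, '男' = male, '女' = female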


# coding: utf-8
import requests, time
from bs4 import BeautifulSoup

# The 13 list pages of short-term rentals in Guangzhou
url = [r'http://gz.xiaozhu.com/search-duanzufang-p{}-0/'.format(str(i)) for i in range(1, 13 + 1)]

# urle = r'http://gz.xiaozhu.com/search-duanzufang-p1-0/'

headers = {
    'Cookie': 'abtest_ABTest4SearchDate=b; gr_user_id=85e9a01a-a0e4-4626-b3fa-bbebc1e43a8c; OZ_1U_2283=vid=v81756d34bb73d.0&ctime=1477924862&ltime=1477924861; OZ_1Y_2283=erefer=-&eurl=http%3A//m.xiaozhu.com/search.html%3Fcityid%3D16%26city%3D%2525E5%2525B9%2525BF%2525E5%2525B7%25259E%26offset%3D1%26step%3D15%26st%3D2016-10-31%26et%3D2016-11-01%26&etime=1477924563&ctime=1477924862&ltime=1477924861&compid=2283; xzuuid=aeacf51c; _gat_UA-33763849-7=1; __utmt=1; OZ_1U_2282=vid=v81732ffe11205.0&ctime=1477926800&ltime=1477926758; OZ_1Y_2282=erefer=https%3A//www.google.com.hk/&eurl=http%3A//www.xiaozhu.com/&etime=1477915390&ctime=1477926800&ltime=1477926758&compid=2282; _ga=GA1.2.1377836010.1477915392; __utma=29082403.1377836010.1477915392.1477915401.1477921654.2; __utmb=29082403.29.10.1477921654; __utmc=29082403; __utmz=29082403.1477915401.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); startDate=2016-10-31; endDate=2016-11-01; gr_session_id_59a81cc7d8c04307ba183d331c373ef6=11901107-c72c-46d7-af6c-a78cc7f8bc82',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
}


def Details_page(urlss):
    # Parse a single listing's detail page and print its fields
    # (note: this request is sent without the custom headers defined above)
    web_data = requests.get(urlss)
    soup = BeautifulSoup(web_data.text, 'lxml')
    title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    adders = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select('#pricePart > div.day_l')
    img = soup.select('#curBigImage')
    hum_img = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexy = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')  # the host's gender is encoded in this span's CSS class
    for titles, adderss, prices, names, sexys, hum_imgs, imgs in zip(title, adders, price, name, sexy, hum_img, img):
        print('标题:' + titles.get_text().strip())                # title
        print('地址:' + adderss.get_text().strip())               # address
        print('价格:' + prices.get_text().strip() + ' 元/晚')     # price per night
        print('房东昵称:' + names.get_text().strip())             # host nickname
        # map the gender icon's class to a label: '' -> 空 (unknown), member_boy_ico -> 男 (male), otherwise 女 (female)
        print('房东性别:' + ('空' if sexys.get('class')[0] == '' else ('男' if sexys.get('class')[0] == 'member_boy_ico' else '女')))
        print('主图链接地址:' + imgs.get('src'))                  # main photo URL
        print('房东头像链接地址:' + hum_imgs.get('src').strip())  # host avatar URL
        DownloadImage_File(hum_imgs.get('src').strip(), names.get_text().strip())


def channel(urls):
    # Fetch one list page and follow every listing link on it
    web_data = requests.get(urls, headers=headers)
    time.sleep(2)  # pause between list pages to go easy on the site
    soup = BeautifulSoup(web_data.text, 'lxml')
    link = soup.select('#page_list > ul > li > a')
    for i in link:
        Details_page(i.get('href'))


def DownloadImage_File(img_Url, name):
    # Stream the host's avatar image to disk, named after the host
    Load_img = requests.get(img_Url, stream=True)
    with open('D:/python工程/四周/01-02/img/' + name + '.jpg', 'wb') as df:
        for chunk in Load_img.iter_content(chunk_size=1024):
            df.write(chunk)
            df.flush()


for get_url in url:
    channel(get_url)
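If you just want to test the parsing without walking all 13 list pages, you can comment out the loop above and call channel directly on the single page-1 URL that the commented-out urle line points to:

channel(r'http://gz.xiaozhu.com/search-duanzufang-p1-0/')  # single-page test run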
