Hi everyone, it feels like a long time since my last post, so today I'm sharing some of my own work: a crawler that scrapes short-term rental listings. The code is fairly short and tidy, so if you are just starting out with crawlers it may be worth a look; for the veterans out there it's nothing special, of course. The code uses a chained conditional expression, and if that part is unclear, feel free to leave a comment! And if you have a better way to scrape this, please do share!
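Before the full script, here is a minimal standalone sketch of that chained conditional expression; the css_class value is made up purely for illustration (it is not scraped from the site), and the host-gender check in the crawler below follows the same pattern:

# A chained conditional expression is read left to right: the first
# branch whose condition holds is taken, otherwise control falls through.
css_class = 'member_boy_ico'  # hypothetical input, for illustration only
gender = ('unknown' if css_class == ''
          else ('male' if css_class == 'member_boy_ico' else 'female'))
print(gender)  # prints: male

Now the full scraper: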
#coding:utf-8
import requests
import time
from bs4 import BeautifulSoup

# List-page URLs for pages 1-13 of the short-term rental search results.
url = [r'http://gz.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 14)]
headers = {
    # NB: this Cookie comes from my own browser session and will eventually
    # expire; grab a fresh one from your own browser if requests start failing.
    'Cookie':'abtest_ABTest4SearchDate=b; gr_user_id=85e9a01a-a0e4-4626-b3fa-bbebc1e43a8c; OZ_1U_2283=vid=v81756d34bb73d.0&ctime=1477924862&ltime=1477924861; OZ_1Y_2283=erefer=-&eurl=http%3A//m.xiaozhu.com/search.html%3Fcityid%3D16%26city%3D%2525E5%2525B9%2525BF%2525E5%2525B7%25259E%26offset%3D1%26step%3D15%26st%3D2016-10-31%26et%3D2016-11-01%26&etime=1477924563&ctime=1477924862&ltime=1477924861&compid=2283; xzuuid=aeacf51c; _gat_UA-33763849-7=1; __utmt=1; OZ_1U_2282=vid=v81732ffe11205.0&ctime=1477926800&ltime=1477926758; OZ_1Y_2282=erefer=https%3A//www.google.com.hk/&eurl=http%3A//www.xiaozhu.com/&etime=1477915390&ctime=1477926800&ltime=1477926758&compid=2282; _ga=GA1.2.1377836010.1477915392; __utma=29082403.1377836010.1477915392.1477915401.1477921654.2; __utmb=29082403.29.10.1477921654; __utmc=29082403; __utmz=29082403.1477915401.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); startDate=2016-10-31; endDate=2016-11-01; gr_session_id_59a81cc7d8c04307ba183d331c373ef6=11901107-c72c-46d7-af6c-a78cc7f8bc82',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
}
def Details_page(urlss):
    web_data = requests.get(urlss, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    # CSS selectors for the fields we want from a listing's detail page.
    title = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > h4 > em')
    adders = soup.select('body > div.wrap.clearfix.con_bg > div.con_l > div.pho_info > p > span')
    price = soup.select('#pricePart > div.day_l')
    img = soup.select('#curBigImage')
    hum_img = soup.select('#floatRightBox > div.js_box.clearfix > div.member_pic > a > img')
    name = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > a')
    sexy = soup.select('#floatRightBox > div.js_box.clearfix > div.w_240 > h6 > span')  # gender is encoded in this tag's class
    # Each selector matches once per detail page, so zip yields a single tuple.
    for titles, adderss, prices, names, sexys, hum_imgs, imgs in zip(title, adders, price, name, sexy, hum_img, img):
        print('Title: ' + titles.get_text().strip())
        print('Address: ' + adderss.get_text().strip())
        print('Price: ' + prices.get_text().strip() + ' yuan/night')
        print('Host nickname: ' + names.get_text().strip())
        # Chained conditional: no class -> unknown, member_boy_ico -> male, otherwise female.
        print('Host gender: ' + ('unknown' if not sexys.get('class') else ('male' if sexys.get('class')[0] == 'member_boy_ico' else 'female')))
        print('Main image URL: ' + imgs.get('src'))
        print('Host avatar URL: ' + hum_imgs.get('src').strip())
        DownloadImage_File(hum_imgs.get('src').strip(), names.get_text().strip())
def channel(urls):
    web_data = requests.get(urls, headers=headers)
    time.sleep(2)  # pause between list pages so we don't hammer the server
    soup = BeautifulSoup(web_data.text, 'lxml')
    # Every listing link on this results page.
    link = soup.select('#page_list > ul > li > a')
    for i in link:
        Details_page(i.get('href'))
def DownloadImage_File(img_Url, name):
    # Stream the avatar to disk in 1 KB chunks; adjust the folder to your setup.
    Load_img = requests.get(img_Url, stream=True)
    with open('D:/python工程/四周/01-02/img/' + name + '.jpg', 'wb') as df:
        for chunk in Load_img.iter_content(chunk_size=1024):
            df.write(chunk)
for get_url in url:
    channel(get_url)
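One caveat: the avatar filename is built straight from the host's nickname, and nicknames can contain characters that Windows forbids in filenames (\ / : * ? " < > |), which would make open() throw. A minimal sanitizing helper, sketched here only as a suggestion (safe_filename is my own name for it, not part of the script above):

import re

def safe_filename(name):
    # Replace characters that are illegal in Windows filenames with '_'.
    return re.sub(r'[\\/:*?"<>|]', '_', name)

With that in place, DownloadImage_File would open 'D:/python工程/四周/01-02/img/' + safe_filename(name) + '.jpg' instead.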