This is a weather scraper. It does not call the weather site's API; instead it fetches weather information in real time by simulating a browser request. Weather lookup currently works for all major cities across China. I have only just started learning web scraping and am still not comfortable with regular expressions or bs4, so this was quite painful to write.
If anyone has a more convenient or more efficient way to write this, suggestions are very welcome (see the rough sketch after the code below)!
To get the city-code file for locations across China, please visit: http://bbs.fishc.com/forum.php?mod=viewthread&tid=74390&page=1&extra=#pid2622513 — apologies for the inconvenience!
The code is as follows:
#coding:utf-8
import requests,re,gzip
from bs4 import BeautifulSoup
import locale
import time
def searh(arg):  # read the city-code file and look up the city the user typed
    dict = {}
    with open('weatherdb.gd', 'r') as df:
        line = df.readlines()
        for i in line:
            if i != '\n':
                tem = i.strip().split('=')
                dict[tem[1]] = tem[0]   # map city name -> city code
    if arg in dict:
        post_up(dict[arg])
    else:
        print('抱歉,没有找到您所在的城市,或者请确认您输入的城市正确!')
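# For reference: searh() assumes each non-blank line of 'weatherdb.gd' has the form
# <city_code>=<city_name>, one city per line. The sample codes below are assumptions,
# so verify them against the file from the forum link above:
#   101010100=北京
#   101280101=广州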
def post_up(arg):  # arg is the city code looked up from the dictionary; build the exact page URL and fetch it
    weather_url = 'http://www.weather.com.cn/weather/%s.shtml' % arg
    s = requests.session()
    herads = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
        'Accept-Encoding': 'gzip, deflate, sdch'
    }
    def ungzip(arg):  # decompress a gzip-compressed response body; requests already decodes
                      # gzip transfer encoding by itself, so this helper is not actually called below
        try:
            print('尝试解压中.....')
            arg = gzip.decompress(arg)
            print('解压完毕!')
        except OSError:
            print('未经压缩,无需解压')
        return arg
    data = s.get(weather_url, headers=herads)
    data.encoding = 'utf-8'
    sup_weather(data)
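# Example (city code assumed): post_up('101010100') would request
# http://www.weather.com.cn/weather/101010100.shtml and hand the response to sup_weather().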
def sup_weather(arg):  # main routine: parse the page and print the forecast
    soup = BeautifulSoup(arg.text, "html.parser")
    fand = soup.find_all('div', class_="c7d")          # the 7-day forecast block
    fand = BeautifulSoup(str(fand), "html.parser")
    feng = fand.find_all(id="hidden_title")            # element whose value attribute carries today's high/low temperatures
    fengxiang = fand.find_all('em')
    if str(fengxiang[0])[-57:-56] == '=':              # brittle: wind direction sliced out of the raw tag text by fixed offsets
        fengxiangs = str(fengxiang[0])[-55:-52]
    else:
        fengxiangs = str(fengxiang[0])[-57:-52]
    re_find = r'value="(.*)"'
    feng = re.findall(re_find, str(feng))
    see = soup.find_all('li')
    day = see[5]                                       # the <li> for today
    fengli = day.find_all('i')
    weathers = day.p.string
    fenglis = fengli[1].string
    data = str(see[12:30])                             # the <li> items holding the living indices
    data = BeautifulSoup(data, "html.parser")
    title_zishu = data.find_all('em')
    text_zizhu = data.find_all('span')
    cent_zishu = data.find_all('p')
    title = []
    text = []
    cent = []
    for i in range(0, 10):
        if title_zishu[i].string is not None:
            title.append(title_zishu[i].string)
        if '天空' not in text_zizhu[i].string:          # skip spans that contain '天空' (sky-condition text)
            text.append(text_zizhu[i].string)
            cent.append(cent_zishu[i].string)
    locale.setlocale(locale.LC_CTYPE, 'chinese')        # Chinese locale for the date string below
    now = time.strftime("%Y年%m月%d日", time.localtime(time.time()))
    print('%s(今天):%s的本日天气为:%s\n录得室外最高温度为:%s℃,最低为:%s℃\n风力:%s(%s)' % (now, send, weathers, str(feng)[-9:-7], str(feng)[-6:-4], fenglis, fengxiangs))
    print('以下是今天各项指数以及建议:')
    print('%s:(%s)建议:%s' % (title[0], text[0], cent[0]))
    print('%s:(%s)建议:%s' % (title[1], text[1], cent[1]))
    print('%s:(%s)建议:%s' % (title[2], text[2], cent[2]))
    print('%s:(%s)建议:%s' % (title[3], text[3], cent[3]))
    print('%s:(%s)建议:%s' % (title[4], text[4], cent[4]))
    print('%s:(%s)建议:%s' % (title[5], text[5], cent[5]))
print('欢迎使用全国天气查询服务!')
send = input('请输入您要查询的城市名称(精确到(市/县/直辖区)范围内):')
searh(send)
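On the request above for a tidier way to write this: most of the fragility comes from slicing fixed offsets out of str(tag) (for example str(fengxiang[0])[-57:-52]) instead of reading tag attributes. Below is a rough, untested sketch of how today's forecast could be pulled out with CSS selectors instead. The class names 'wea', 'tem' and 'win', the span title attribute for the wind direction, and the assumption that the first <li> inside div.c7d is today's forecast are guesses about the page layout, so check them against the live HTML before relying on this.

import requests
from bs4 import BeautifulSoup

def fetch_today(city_code):  # city_code such as '101010100' (assumed to be Beijing)
    url = 'http://www.weather.com.cn/weather/%s.shtml' % city_code
    resp = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    today = soup.select('div.c7d ul li')[0]                            # assumed: first <li> is today
    weather = today.find('p', class_='wea').get_text(strip=True)       # weather description
    temps = today.find('p', class_='tem').get_text(' ', strip=True)    # high / low in one string
    wind = today.find('p', class_='win')
    directions = [s.get('title') for s in wind.find_all('span')]       # wind direction from the title attribute
    force = wind.i.get_text(strip=True)                                # wind force
    return weather, temps, directions, force

# Example call (hypothetical code): print(fetch_today('101010100'))

Note this sketch has no error handling: if any of the assumed class names is missing from the page, the find() calls return None and the attribute access will fail.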