python爬虫

最近在玩Python,自己平时喜欢和同事看电影,就寻思写了个爬虫来定时爬取优惠电影。

完整代码如下:
# author: 13sai
# coding: utf-8
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
# Work out the date of the upcoming Friday (today itself when today is a
# Friday); `nowtime` is the show date sent to the ticket sites further down.
now = datetime.datetime.now()
# weekday() numbers Monday as 0 and Sunday as 6, so Friday is 4. The modulo
# keeps the offset in 0..6. The earlier weekend branch (`12 - weekday`) added
# 7 days on a Saturday and 6 on a Sunday, which lands on a Saturday, not a
# Friday.
week = (4 - now.weekday()) % 7  # days from today until that Friday
delta = datetime.timedelta(days=week)
n_days = now + delta
nowtime = n_days.strftime('%Y-%m-%d')  # the Friday's date as YYYY-MM-DD
jiage = 30  # price ceiling: only showings strictly cheaper than this are reported

# Fetch the Gewara listing for the target date. The response is closed as soon
# as it has been read instead of being left open for the rest of the run.
with urlopen("http://www.gewara.com/cinema/ajax/getOpiItemPage.xhtml?cid=cinemaid&mid=&fyrq=" + nowtime) as response:
    url_gwl = response.read().decode("utf-8")
soupurl_gwl = BeautifulSoup(url_gwl, "html.parser")
# The loop below reads an `id` attribute and an <img alt=...> child from each
# of these anchors.
ids_gwl = soupurl_gwl.findAll("a", attrs={"href": "javascript:void(0);"})

# Accumulators shared with the rest of the script: str1 collects Gewara rows,
# str2 collects Taopiaopiao rows, str is the combined mail body.
# NOTE: `str` shadows the builtin; the name is kept because the mail section
# of this script reads it.
str = ''
str1 = ''
str2 = ''

gwl_rows = []
for idattr_gwl in ids_gwl:
    # The same URL is fetched here and linked from the generated HTML row.
    detail_url_gwl = (
        "http://www.gewara.com/movie/ajax/getOpiItemNew.xhtml?movieid="
        + idattr_gwl['id'] + "&fyrq=" + nowtime + "&cid=cinemaid"
    )
    with urlopen(detail_url_gwl) as response:
        res_gwl = response.read().decode("utf-8")
    soup_gwl = BeautifulSoup(res_gwl, "html.parser")
    links_gwl = soup_gwl.findAll("span", attrs={"class": "opiPrice"})
    arrtime_gwl = soup_gwl.findAll("span", attrs={"class": "opitime"})
    # Prices and show times are paired by position; zip() stops at the shorter
    # list, so a missing time cell cannot raise IndexError.
    for price_span, time_span in zip(links_gwl, arrtime_gwl):
        price_tag = price_span.find("b")
        if price_tag is None:
            # No <b> element in this price cell, so there is nothing to compare.
            continue
        price_text = price_tag.get_text()
        if int(price_text) < jiage:
            gwl_rows.append(
                "<p>" + time_span.get_text() + "票价:" + price_text
                + idattr_gwl.find('img')['alt']
                + "<a href='" + detail_url_gwl + "'>点击查看</a></p>"
            )
# Join once at the end instead of growing the string inside the loop.
str1 = "".join(gwl_rows)
# Fetch the Taopiaopiao "now showing" list; every movie card links to a page
# whose URL carries the show id.
with urlopen("http://dianying.taobao.com/showList.htm?spm=a1z21.3046609.w2.3.9ilG5t&n_s=new") as response:
    url_tb = response.read().decode("utf-8")
soupurl_tb = BeautifulSoup(url_tb, "html.parser")
ids_tb = soupurl_tb.findAll("a", attrs={"class": "movie-card"})

# Compiled once, outside the loop.
pattern = re.compile(r"showId=(\d+)")
tb_rows = []
for idattr_tb in ids_tb:
    id_group_tb = pattern.search(idattr_tb['href'])
    # The original tested the builtin `id` function here, which is always
    # truthy, so a card without a showId crashed on `None.group(1)`.
    if id_group_tb is None:
        continue
    # The same URL is fetched here and linked from the generated HTML row.
    schedule_url_tb = (
        "http://dianying.taobao.com/cinemaDetailSchedule.htm?cinemaId=cinemaid&activityId=&fCode=&showId="
        + id_group_tb.group(1) + "&showDate=" + nowtime
    )
    with urlopen(schedule_url_tb) as response:
        res_tb = response.read().decode("utf-8")
    soup_tb = BeautifulSoup(res_tb, "html.parser")
    links_tb = soup_tb.findAll("td", attrs={"class": "hall-price"})
    arrtime_tb = soup_tb.findAll("td", attrs={"class": "hall-time"})
    # Prices and show times are paired by position; zip() stops at the shorter
    # list, so a missing time cell cannot raise IndexError.
    for price_cell, time_cell in zip(links_tb, arrtime_tb):
        price_tag = price_cell.find("em")
        if price_tag is None:
            # No <em> element in this price cell, so there is nothing to compare.
            continue
        price_text = price_tag.get_text()
        if float(price_text) < jiage:
            name_tb = idattr_tb.find('span', attrs={"class": "bt-l"})
            tb_rows.append(
                "<p>" + time_cell.get_text() + "票价:" + price_text
                + name_tb.get_text()
                + "<a href='" + schedule_url_tb + "'>点击查看</a></p>"
            )
# Append to the accumulator initialised earlier in the script.
str2 = str2 + "".join(tb_rows)
# Send one HTML mail listing every discounted showing found, or just report
# that there is nothing to send.
str1 = str1.strip()
str2 = str2.strip()
if str1 or str2:
    from email import encoders
    from email.header import Header
    from email.mime.text import MIMEText
    from email.utils import parseaddr, formataddr
    import smtplib

    # Third-party SMTP service settings ("@" values are placeholders to fill in).
    mail_host = "smtp.qq.com"  # SMTP server
    mail_user = "@"  # user name
    mail_pass = "@"  # password; for QQ mail this is the authorization code
    sender = '@'
    receivers = ['@']  # recipients; can be your QQ mailbox or any other address

    # Build the body in its own variable rather than one named `str`, which
    # would shadow the builtin type.
    mail_body = ''
    if str1:
        mail_body = "格瓦拉:<br/>" + str1 + "<br/><br/><br/><br/>"
    if str2:
        mail_body = mail_body + "淘票票:<br/>" + str2 + "<br/><br/><br/><br/>"

    message = MIMEText(mail_body, 'html', 'utf-8')
    message['From'] = Header("@", 'utf-8')
    message['To'] = Header("@", 'utf-8')
    subject = '优惠电影'
    message['Subject'] = Header(subject, 'utf-8')

    try:
        # The context manager sends QUIT and closes the socket even when
        # login() or sendmail() raises, so the connection is never leaked.
        with smtplib.SMTP_SSL(mail_host, 465) as smtpObj:
            smtpObj.login(mail_user, mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
    except smtplib.SMTPException as exc:
        # Print the actual error; the original printed the exception class.
        print(exc)
    else:
        print("邮件发送成功")
else:
    print("暂无优惠电影")