Python 爬虫

最近在玩 Python，自己平时喜欢和同事看电影，就寻思写了个爬虫，定时爬取格瓦拉和淘票票上的优惠电影场次，并通过邮件发送提醒。

# author: puresai    
# coding: utf-8
from urllib.request import urlopen    
from bs4 import BeautifulSoup    
import re    
import datetime    
# Work out the date of the upcoming Friday (today, if today is a Friday).
# Showtimes are always queried for that date.
now = datetime.datetime.now()
# weekday(): Monday == 0 ... Friday == 4 ... Sunday == 6.
# A plain `4 - weekday` goes negative on Saturday/Sunday; the modulo keeps the
# offset in 0..6 so weekends roll forward to next week's Friday.
week = (4 - now.weekday()) % 7  # number of days from today to that Friday
delta = datetime.timedelta(days=week)
n_days = now + delta
nowtime = n_days.strftime('%Y-%m-%d')  # the target Friday as YYYY-MM-DD
jiage = 30  # price ceiling: only sessions strictly cheaper than this are reported

# --- Gewara -----------------------------------------------------------------
# Fetch the cinema's movie list for the target date, then every movie's
# session table, and collect the sessions priced below `jiage` as HTML
# paragraphs in `str1`.
with urlopen("http://www.gewara.com/cinema/ajax/getOpiItemPage.xhtml?cid=cinemaid&mid=&fyrq="+nowtime) as resp_gwl:
    url_gwl = resp_gwl.read().decode("utf-8")
soupurl_gwl = BeautifulSoup(url_gwl,"html.parser")
ids_gwl = soupurl_gwl.findAll("a",attrs ={"href":"javascript:void(0);"})

# `str` shadows the builtin; the name is kept because the mail-assembly code
# further down may read it as the initial (empty) mail body.
str = ''
str1 = ''  # Gewara results (HTML)
str2 = ''  # Taopiaopiao results (HTML), filled in later in the script

parts_gwl = []
for idattr_gwl in ids_gwl:
    movie_id_gwl = idattr_gwl.get('id')
    if not movie_id_gwl:
        # Not a movie entry: nothing to query for this anchor.
        continue
    # The same URL is used both to fetch the sessions and as the link in the mail.
    detail_url_gwl = "http://www.gewara.com/movie/ajax/getOpiItemNew.xhtml?movieid="+movie_id_gwl+"&fyrq="+nowtime+"&cid=cinemaid"
    with urlopen(detail_url_gwl) as resp_gwl:
        res_gwl = resp_gwl.read().decode("utf-8")
    soup_gwl = BeautifulSoup(res_gwl,"html.parser")
    links_gwl = soup_gwl.findAll("span",attrs ={"class":"opiPrice"})
    arrtime_gwl = soup_gwl.findAll("span",attrs ={"class":"opitime"})

    # The movie title is read from the poster image's alt text.
    img_gwl = idattr_gwl.find('img')
    title_gwl = img_gwl.get('alt', '') if img_gwl is not None else ''

    # Pair each show time with its price; zip() stops at the shorter list, so
    # a mismatch between the two cannot raise IndexError.
    for time_gwl, price_gwl in zip(arrtime_gwl, links_gwl):
        price_tag_gwl = price_gwl.find("b")
        if price_tag_gwl is None:
            continue
        price_text_gwl = price_tag_gwl.get_text()
        try:
            # float() also accepts prices such as "29.9" that int() rejects.
            price_value_gwl = float(price_text_gwl)
        except ValueError:
            continue  # price text is not a number; skip this session
        if price_value_gwl < jiage:
            parts_gwl.append("<p>"+time_gwl.get_text()+"票价:"+price_text_gwl+title_gwl+"<a href='"+detail_url_gwl+"'>点击查看</a></p>")
str1 = "".join(parts_gwl)
# --- Taopiaopiao (Taobao movies) ----------------------------------------------
# Fetch the list of movies now showing, then every movie's schedule at the
# cinema for the target date, and append the sessions priced below `jiage`
# to `str2` as HTML paragraphs.
with urlopen("http://dianying.taobao.com/showList.htm?spm=a1z21.3046609.w2.3.9ilG5t&n_s=new") as resp_tb:
    url_tb = resp_tb.read().decode("utf-8")
soupurl_tb = BeautifulSoup(url_tb,"html.parser")
ids_tb = soupurl_tb.findAll("a",attrs ={"class":"movie-card"})

pattern = re.compile(r"showId=(\d+)")  # compiled once, reused for every card
parts_tb = []
for idattr_tb in ids_tb:
    id_group_tb = pattern.search(idattr_tb.get('href', ''))
    if id_group_tb is None:
        # The card's link carries no showId, so there is no schedule to query.
        continue
    # The same URL is used both to fetch the schedule and as the link in the mail.
    schedule_url_tb = "http://dianying.taobao.com/cinemaDetailSchedule.htm?cinemaId=cinemaid&activityId=&fCode=&showId="+id_group_tb.group(1)+"&showDate="+nowtime
    with urlopen(schedule_url_tb) as resp_tb:
        res_tb = resp_tb.read().decode("utf-8")
    soup_tb = BeautifulSoup(res_tb,"html.parser")
    links_tb = soup_tb.findAll("td",attrs ={"class":"hall-price"})
    arrtime_tb = soup_tb.findAll("td",attrs ={"class":"hall-time"})

    # The movie name is read from the card's "bt-l" span.
    name_tb = idattr_tb.find('span',attrs={"class":"bt-l"})
    name_text_tb = name_tb.get_text() if name_tb is not None else ''

    # Pair each show time with its price; zip() stops at the shorter list, so
    # a mismatch between the two cannot raise IndexError.
    for time_tb, price_tb in zip(arrtime_tb, links_tb):
        price_tag_tb = price_tb.find("em")
        if price_tag_tb is None:
            continue
        price_text_tb = price_tag_tb.get_text()
        try:
            price_value_tb = float(price_text_tb)
        except ValueError:
            continue  # price text is not a number; skip this session
        if price_value_tb < jiage:
            parts_tb.append("<p>"+time_tb.get_text()+"票价:"+price_text_tb+name_text_tb+"<a href='"+schedule_url_tb+"'>点击查看</a></p>")
# `str2` is initialised earlier in the script; append the Taopiaopiao results.
str2 = str2+"".join(parts_tb)
# --- Notification ---------------------------------------------------------------
# Mail the collected sessions if either site produced any; otherwise just
# report that there is nothing to send.
str1 = str1.strip()
str2 = str2.strip()
if str1 != '' or str2 != '':
    from email import encoders
    from email.header import Header
    from email.mime.text import MIMEText
    from email.utils import parseaddr, formataddr
    import smtplib

    # Third-party SMTP service
    mail_host="smtp.qq.com"  # SMTP server
    mail_user="@"    # user name
    mail_pass="@"   # password; for QQ Mail this is the authorization code
    sender = '@'
    receivers = ['@']  # recipients, e.g. your QQ mailbox or any other address

    # Build the HTML body in its own variable rather than one named `str`,
    # with one section per site that returned results.
    mail_body = ''
    if str1 != '':
        mail_body = "格瓦拉:<br/>"+str1+"<br/><br/><br/><br/>"
    if str2 != '':
        mail_body = mail_body+"淘票票:<br/>"+str2+"<br/><br/><br/><br/>"

    message = MIMEText(mail_body, 'html', 'utf-8')
    message['From'] = Header("@", 'utf-8')
    message['To'] =  Header("@", 'utf-8')
    subject = '优惠电影'
    message['Subject'] = Header(subject, 'utf-8')

    try:
        # The context manager sends QUIT and closes the connection even when
        # login or sendmail fails.
        with smtplib.SMTP_SSL(mail_host, 465) as smtpObj:
            smtpObj.login(mail_user,mail_pass)
            smtpObj.sendmail(sender, receivers, message.as_string())
        print("邮件发送成功")
    except (smtplib.SMTPException, OSError) as exc:
        # OSError covers connection-level failures (DNS, refused, TLS).
        # Print the actual error instead of the exception class.
        print("邮件发送失败:", exc)
else:
    print("暂无优惠电影")

Python 爬虫
https://blog.puresai.com/2017/01/14/98/
作者
puresai
许可协议