安全资讯网站爬虫

小编收到了 小阔爱 发来的文章,在2020年2月20号20点20分和大家分享一下。

使用 python 的 flask 框架运行

预览效果

下面是源码:

运行环境 python2+windows10(其他的也行)

下载所需第三方库:

pip install flask

pip install bs4

pip install requests

运行方式: python sawyer.py

然后打开浏览器,访问 127.0.0.1:5000

真实环境中建议关掉 flask 的 debug 模式:开启 debug 时,网页一旦报错,只要输入 PIN 码就能进入 python 的交互式调试界面,从而执行任意代码。

# -*- coding: utf-8 -*-

from flask import Flask,redirect,url_for

import re,os,requests

from bs4 import BeautifulSoup as bsp

app = Flask(__name__)

#####FREEBUF##########################################################################################################################################

@app.route("/freebuf/<number>")
def get_freebuf_message(number):
    """Scrape one FreeBuf listing page and render it as a plain HTML page.

    Args:
        number: Page number taken from the URL path (arrives as a string).

    Returns:
        The rendered HTML string on success; a ``(message, 400)`` tuple when
        ``number`` is not an integer; a ``(message, 502)`` tuple when the
        upstream request fails.
    """
    # Validate first: int(number) used to be evaluated while building the
    # "next page" button, so a non-numeric path raised ValueError (HTTP 500).
    try:
        next_page = int(number) + 1
    except ValueError:
        return "页码必须是整数", 400

    url = "http://www.freebuf.com/page/" + str(number)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0"}
    try:
        # Without a timeout a stalled upstream would block this worker forever.
        res = requests.get(url, headers=header, timeout=10)
    except requests.RequestException:
        return "抓取 FREEBUF 失败,请稍后再试", 502
    soup = bsp(res.text, "html.parser")

    rule = "<HR width='100%' color=#d9d6c3 SIZE=5>"
    # The same navigation bar is emitted above and below the article list.
    nav = "".join([
        "<input type='button' name='Submit' onclick='javascript:history.back(-1);' value='上一页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/\";' value='回首页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/freebuf/" + str(next_page) + "\";' value='下一页'>",
        rule,
    ])

    result_title = ["<strong>FREEBUF 当前浏览的页数:" + str(number) + "</strong><br>" + rule, nav]

    # NOTE(review): scraped text and markup are inserted into the page
    # unescaped (the <img> tag is deliberately passed through as HTML).
    for entry in soup.find_all('div', class_="news_inner"):
        # Best effort per article: any missing element yields an error marker
        # for that entry instead of failing the whole page.
        try:
            link = entry.find(class_="news-info").a
            result_title.append("<strong><a href=" + str(link['href']) + ">" + str(link['title']) + "</a></strong>")
            result_title.append("<br><span>日期:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='time').get_text().strip()) + "</a></strong></span>")
            result_title.append("<br><span>作者:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(rel='author').get_text()) + "</a></strong></span>")
            result_title.append("<br><span>标签:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='tags').get_text().strip()) + "</a></strong></span>")
            result_title.append("<br><span>浏览:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='look').get_text().strip().replace(' ', '').replace('\n', '')) + "</a></strong></span>")
            result_title.append("<p>摘要:</p><code>" + str(entry.find(class_='text').get_text().strip()) + "</code>")
            result_title.append("<br>" + str(entry.find(class_='news-img').img))
            result_title.append(rule)
        except Exception:
            result_title.append("有一些错误!!!" + rule)

    result_title.append(nav)
    return "".join(result_title)

@app.route("/freebuf/")
def to_freebuf():
    """Redirect the bare ``/freebuf/`` URL to page 1 of the FreeBuf view."""
    first_page_url = url_for('get_freebuf_message', number=1)
    return redirect(first_page_url)

#####aqniu#########################################################################################################################################

@app.route("/aqniu/<number>")
def get_aqniu_message(number):
    """Scrape one aqniu.com listing page and render it as a plain HTML page.

    Args:
        number: Page number taken from the URL path (arrives as a string).

    Returns:
        The rendered HTML string on success; a ``(message, 400)`` tuple when
        ``number`` is not an integer; a ``(message, 502)`` tuple when the
        upstream request fails.
    """
    # Validate first: int(number) used to be evaluated while building the
    # "next page" button, so a non-numeric path raised ValueError (HTTP 500).
    try:
        next_page = int(number) + 1
    except ValueError:
        return "页码必须是整数", 400

    url = "http://www.aqniu.com/page/" + str(number)
    header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0"}
    try:
        # Without a timeout a stalled upstream would block this worker forever.
        res = requests.get(url, headers=header, timeout=10)
    except requests.RequestException:
        return "抓取 安全牛 失败,请稍后再试", 502
    soup = bsp(res.text, "html.parser")

    rule = "<HR width='100%' color=#d9d6c3 SIZE=5>"
    # The same navigation bar is emitted above and below the article list.
    nav = "".join([
        "<input type='button' name='Submit' onclick='javascript:history.back(-1);' value='上一页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/\";' value='回首页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/aqniu/" + str(next_page) + "\";' value='下一页'>",
        rule,
    ])

    result_title = ["<strong>安全牛 当前浏览的页数:" + str(number) + "</strong><br>" + rule, nav]

    entries = soup.find_all('div', class_="layout_3--item")
    # The previous counter loop broke out on the fifth element, so only the
    # first four entries are rendered; the slice keeps that limit. A debug
    # print and a "dedupe" loop that could never append were dropped.
    # NOTE(review): scraped text and markup are inserted into the page
    # unescaped (the title <p> and the <img> tag are passed through as HTML).
    for entry in entries[:4]:
        # Best effort per article: any missing element yields an error marker
        # for that entry instead of failing the whole page.
        try:
            result_title.append("<strong><a href=" + str(entry.a['href']) + ">" + str(entry.find(class_="col-md-7").p) + "</a></strong>")
            result_title.append("<br><span>日期:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='date').get_text().strip()) + "</a></strong></span>")
            result_title.append("<br><span>作者:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(rel='author').get_text()) + "</a></strong></span>")
            result_title.append("<br><span>标签:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='cat').get_text().strip().replace(' ', '').replace('\n', '').replace('标签:', '')) + "</a></strong></span>")
            result_title.append("<br><span>浏览:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(entry.find(class_='views').get_text().strip().replace(' ', '').replace('\n', '')) + "</a></strong></span>")
            result_title.append("<p>摘要:</p><code>" + str(entry.p.text.strip().replace(' ', '').replace('\n', '')) + "</code>")
            result_title.append("<br>" + str(entry.a.img))
            result_title.append(rule)
        except Exception:
            result_title.append("有一些错误!!!" + rule)

    result_title.append(nav)
    return "".join(result_title)

@app.route("/aqniu/")
def to_aqniu():
    """Redirect the bare ``/aqniu/`` URL to page 1 of the aqniu view."""
    first_page_url = url_for('get_aqniu_message', number=1)
    return redirect(first_page_url)

#####安全客###########################################################################################################################################

@app.route("/anquanke/<number>")
def get_anquanke_message(number):
    """Fetch one page of posts from the anquanke JSON API and render it as HTML.

    Args:
        number: Page number taken from the URL path (arrives as a string).

    Returns:
        The rendered HTML string on success; a ``(message, 400)`` tuple when
        ``number`` is not an integer; a ``(message, 502)`` tuple when the
        upstream request fails or does not return JSON.
    """
    # Validate first: int(number) used to be evaluated while building the
    # "next page" button, so a non-numeric path raised ValueError (HTTP 500).
    try:
        next_page = int(number) + 1
    except ValueError:
        return "页码必须是整数", 400

    url = "https://api.anquanke.com/data/v1/posts?page=" + str(number) + "&size=20"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0"}
    try:
        # Without a timeout a stalled upstream would block this worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        payload = response.json()
    except (requests.RequestException, ValueError):
        # ValueError covers a body that is not valid JSON.
        return "抓取 安全客 失败,请稍后再试", 502
    # A missing or null "data" field previously made the loop raise TypeError.
    data = payload.get('data') or []

    rule = "<HR width='100%' color=#d9d6c3 SIZE=5>"
    # The same navigation bar is emitted above and below the article list.
    nav = "".join([
        "<input type='button' name='Submit' onclick='javascript:history.back(-1);' value='上一页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/\";' value='回首页'>",
        "<input type='button' name='Submit' onclick='javascript:top.location=\"/anquanke/" + str(next_page) + "\";' value='下一页'>",
        rule,
    ])

    result_title = [
        "<body>",
        "<strong>安全客 当前浏览的页数:" + str(number) + "</strong><br>" + rule,
        nav,
    ]

    # NOTE(review): API-provided text is inserted into the page unescaped.
    for item in data:
        # Best effort per post: a malformed item yields an error marker for
        # that entry instead of failing the whole page.
        try:
            # NOTE(review): assumes the title check guarded the whole entry in
            # the original (its indentation was lost) — confirm.
            if item.get('title') is None:
                continue
            result_title.append("<strong><a href=" + str("https://www.anquanke.com/post/id/" + str(item.get('id'))) + ">" + str(item.get('title')) + "</a></strong>")
            result_title.append("<br><span>日期:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(item.get('date')) + "</a></strong></span>")
            result_title.append("<br><span>作者:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(item.get('author')["nickname"]) + "</a></strong></span>")
            result_title.append("<br><span>标签:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(item.get('category_name')) + "</a></strong></span>")
            result_title.append("<br><span>浏览:<strong><a style='font-size:80%;color:#4e72b8;'>" + str(item.get('pv')) + "</a></strong></span>")
            result_title.append("<p>摘要:</p><code>" + str(item.get('desc')) + "</code>")
            result_title.append("<br><img border='0' src=" + str(item.get('cover')) + " width='304' height='228'/>")
            result_title.append(rule)
        except Exception:
            result_title.append("有一些错误!!!" + rule)

    result_title.append(nav)
    result_title.append("</body>")
    return "".join(result_title)

@app.route("/anquanke/")
def to_anquanke():
    """Redirect the bare ``/anquanke/`` URL to page 1 of the anquanke view."""
    first_page_url = url_for('get_anquanke_message', number=1)
    return redirect(first_page_url)

######################################################################################################################################################

@app.route("/")
def index():
    """Render the landing page: a logo and a link for each of the three sources."""
    fragments = (
        # Page heading.
        "<body style='text-align: center'><strong><h1>安全资讯网站爬虫</h1></strong>",
        # FreeBuf logo + link.
        "<img src='http://tm-image.qccip.com/qcctmImage/188271e451ab397b192d5d56ba2fa4a2.jpg' alt='FREEBUF' style='background-color:#00ae9d;width: 25%; height: 25%;'/>",
        "<br><strong><a href='/freebuf/1'>FREEBUF</a></strong><br>",
        # anquanke logo + link.
        "<img src='https://p0.ssl.qhimg.com/t01dcf171a151ec5398.png' style='background-color:#00ae9d;width: 25%; height: 25%;'/>",
        "<br><strong><a href='/anquanke/1'>安全客(anquanke)</a></strong><br>",
        # aqniu logo + link.
        "<img src='https://www.aqniu.com/wp-content/themes/anquanniu/images/aqniu-logo.svg' style='background-color:white;width: 25%; height: 25%;'/>",
        "<br><strong><a href='/aqniu/1'>安全牛(aqniu)</a></strong><br>",
        # canvas-nest.js script tag loaded from a CDN; it also closes <body>.
        "<script src='https://cdn.bootcss.com/canvas-nest.js/2.0.4/canvas-nest.js' color='0,205,205' opacity='0.7' zIndex='-2' count='200'></script></body>",
    )
    return "".join(fragments)

if __name__ == '__main__':
    # Security: debug=True turns on the Werkzeug interactive debugger, which
    # lets anyone who obtains the PIN run arbitrary Python on this host.
    # Debug mode is therefore opt-in: set FLASK_DEBUG=1 for local development.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")

如果你有什么想学的想看的想问的都可以加我奥

广告时间

我来评几句
登录后评论

已发表评论数()

相关站点

热门文章