文艺一把!用Python爬取《读者》并制作PDF!
- 作者: 啊一一长城
- 来源: 51数据库
- 2022-08-12
#!/usr/bin/env python
#coding=utf-8
"""
Author: Anemone
Filename: getmain.py
Last modified: 2015-02-19 16:47
E-mail: anemone@82flex.com
"""
import urllib2
from bs4 import BeautifulSoup
import re
import sys
# Python 2 only: make UTF-8 the interpreter-wide default codec used for
# implicit str/unicode conversions. reload(sys) has to come first because
# sys.setdefaultencoding is removed from the sys module during startup.
reload(sys)
sys.setdefaultencoding('utf-8')
def getEachArticle(url):
    """Download one article page and extract its fields.

    Args:
        url: Address of the article page to fetch.

    Returns:
        A dict with four keys:
          "title"   -- ``.string`` of the first <h1> element;
          "writer"  -- stripped ``.string`` of the element with id
                       "pub_date";
          "from"    -- stripped ``.string`` of the element with id
                       "media_name";
          "context" -- the piece of the page text that follows the first
                       match of the regex ``BAIDU_CLB.*;`` (and precedes
                       the second match, if there is one).

    Raises:
        IndexError: The page text contains no ``BAIDU_CLB...;`` marker.
        AttributeError: One of the expected elements is missing.
        urllib2.URLError: The page could not be fetched.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; confirm which one is intended before pinning it.
    soup = BeautifulSoup(html)
    title = soup.find("h1").string
    writer = soup.find(id="pub_date").string.strip()
    _from = soup.find(id="media_name").string.strip()

    # The page text is cut at every "BAIDU_CLB...;" statement; the piece
    # right after the first one is taken as the article body.
    parts = re.split(r"BAIDU_CLB.*;", soup.get_text())
    if len(parts) < 2:
        raise IndexError("no BAIDU_CLB marker found in %s" % url)

    return {
        "title": title,
        "writer": writer,
        "from": _from,
        "context": parts[1],
    }
#new=open("new.txt","w")
#new.write(result["title"]+"\n\n")
#new.write(result["writer"]+" "+result["from"])
#new.write(result["context"])
#new.close()
def getCatalog(issue, maxRetries=3):
    """Collect every article of one issue, grouped by column heading.

    The issue's index.html is fetched first; the first link inside its
    first <table> leads to a page whose <h2> headings name the columns.
    Every <a> found under a heading's parent element is downloaded with
    getEachArticle.

    Args:
        issue: Issue identifier such as "201424". Its first four and last
            two characters form the directory name "<first4>_<last2>".
        maxRetries: How many times each article download is attempted
            before the article is skipped (a message goes to stderr).

    Returns:
        A dict mapping each <h2> heading's ``.string`` to the list of
        article dicts (as returned by getEachArticle) under that heading.
        Articles that could not be downloaded are left out.
    """
    url = "http://www.52duzhe.com/" + issue[:4] + "_" + issue[-2:] + "/"

    indexSoup = _fetchSoup(url + "index.html")
    catalogSoup = _fetchSoup(url + indexSoup.table.a.get("href"))

    duzhe = dict()
    for heading in catalogSoup.find_all("h2"):
        column = heading.string
        print(column)
        duzhe[column] = list()
        for link in heading.parent.find_all("a"):
            href = url + link.get("href")
            print(href)
            article = _fetchArticle(href, maxRetries)
            if article is not None:
                duzhe[column].append(article)
    return duzhe


def _fetchSoup(url):
    """Fetch *url* and return its parsed BeautifulSoup tree."""
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    return BeautifulSoup(html)


def _fetchArticle(href, maxRetries):
    """Call getEachArticle up to *maxRetries* times; None if all fail."""
    lastError = None
    for _ in range(maxRetries):
        try:
            return getEachArticle(href)
        except Exception as err:
            # Exception (not a bare except) keeps Ctrl-C working, and the
            # bounded loop cannot spin forever on a permanently bad page.
            lastError = err
    sys.stderr.write("skipped %s: %s\n" % (href, lastError))
    return None
def readDuZhe(duzhe):
    """Print the title of every article in a catalog, one per line.

    Args:
        duzhe: Mapping of column name to a list of article dicts, each
            with a "title" key (the structure getCatalog returns).
    """
    for articles in duzhe.values():
        for article in articles:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(article["title"])
if __name__ == '__main__':
    # The issue is hard-coded; to ask for it interactively instead, use
    # issue = raw_input("issue(201501):") and pass that to getCatalog.
    catalog = getCatalog("201424")
    readDuZhe(catalog)
推荐阅读
热点文章
Discord.py(重写)on_member_update 无法正常工作
0
Discord.py 在 vc 中获取用户分钟数
0
discord.py 重写 |为我的命令出错
0
Discord.py rewrite 如何 DM 命令?
0
播放音频时,最后一部分被切断.如何解决这个问题?(discord.py)
0
在消息删除消息 Discord.py
0
如何使 discord.py 机器人私人/直接消息不是作者的人?
0
(Discord.py) 如何获取整个嵌入内容?
0
Discord bot 尽管获得了许可,但不能提及所有人
0
Discord.py discord.NotFound 异常
0
