最近在看《我当道士那些年》,小说写得不错,奈何仍未完结。
写了个脚本用来更新小说内容。
没有做添加的处理,每次都是直接完全更新一遍。有兴趣的书友可以修改下。
（原文此处有配图，图片链接已失效。）
写了个脚本用来更新小说内容。
没有做添加的处理,每次都是直接完全更新一遍。有兴趣的书友可以修改下。
#!/usr/bin/python
#coding=utf-8
# Scraper for the novel site below (Python 2 only): collects chapter links
# from the index page and dumps every chapter's text into one local file.
import urllib2
import HTMLParser
import re
# Index page of the novel; every chapter link starts with this prefix.
url = "http://www.wodangdaoshinaxienian.com"
# Chapter URLs discovered by the app_url parser while scanning the index page.
urlSeeds = []
print "####### preprocess " + url + " ######"
class app_url(HTMLParser.HTMLParser):
    """HTML parser that collects in-site chapter links into urlSeeds.

    Reads the module-level `url` (site prefix) and appends every new
    matching href to the module-level `urlSeeds` list.
    """

    def handle_starttag(self, tag, attrs):
        # Only anchor tags can carry chapter links.
        if tag != 'a':
            return
        for name, value in attrs:
            # Keep links that point into this site, skipping the index
            # page itself and anything already collected (de-duplication).
            if name == "href" and value not in urlSeeds:
                if value.startswith(url) and value != url:
                    urlSeeds.append(value)
# Output file for the whole book; mode 'w' means every run rewrites it
# from scratch (no incremental update, as the post itself notes).
fp = open(u"我当道士那些年.txt", 'w');
def SaveText(chapterUrl):
try:
req = urllib2.Request(chapterUrl)
content = urllib2.urlopen(req).read()
except URLError, e:
print e.reason
titleRe = r'<meta name="keywords" content="(.*)" />?'
contentRe = r'<div class="bookcontent clearfix" id="BookText">(.*)</div>?'
title = re.findall(titleRe, content)
content = re.findall(contentRe, content)
titleWrite = title[0] + '\n'
contentWrite = '\n'.join(content[0].split('<br/><br/>'))
print "Processing " + title[0] + "
"
fp.write(titleWrite)
fp.write(contentWrite)
lParser = app_url()
try:
req = urllib2.Request(url)
content = urllib2.urlopen(req).read()
except URLError, e:
print e.reason
lParser.feed(content)
lParser.close();
print "############ urls parsed
#######################"
for chapterUrl in urlSeeds:
SaveText(chapterUrl)
fp.close()
#coding=utf-8
# Scraper for the novel site below (Python 2 only): collects chapter links
# from the index page and dumps every chapter's text into one local file.
import urllib2
import HTMLParser
import re
# Index page of the novel; every chapter link starts with this prefix.
url = "http://www.wodangdaoshinaxienian.com"
# Chapter URLs discovered by the app_url parser while scanning the index page.
urlSeeds = []
print "####### preprocess " + url + " ######"
class app_url(HTMLParser.HTMLParser):
    """HTML parser that collects in-site chapter links into urlSeeds.

    Reads the module-level `url` (site prefix) and appends every new
    matching href to the module-level `urlSeeds` list.
    """

    def handle_starttag(self, tag, attrs):
        # Only anchor tags can carry chapter links.
        if tag != 'a':
            return
        for name, value in attrs:
            # Keep links that point into this site, skipping the index
            # page itself and anything already collected (de-duplication).
            if name == "href" and value not in urlSeeds:
                if value.startswith(url) and value != url:
                    urlSeeds.append(value)
# Output file for the whole book; mode 'w' means every run rewrites it
# from scratch (no incremental update, as the post itself notes).
fp = open(u"我当道士那些年.txt", 'w');
def SaveText(chapterUrl):
try:
req = urllib2.Request(chapterUrl)
content = urllib2.urlopen(req).read()
except URLError, e:
print e.reason
titleRe = r'<meta name="keywords" content="(.*)" />?'
contentRe = r'<div class="bookcontent clearfix" id="BookText">(.*)</div>?'
title = re.findall(titleRe, content)
content = re.findall(contentRe, content)
titleWrite = title[0] + '\n'
contentWrite = '\n'.join(content[0].split('<br/><br/>'))
print "Processing " + title[0] + "

fp.write(titleWrite)
fp.write(contentWrite)
lParser = app_url()
try:
req = urllib2.Request(url)
content = urllib2.urlopen(req).read()
except URLError, e:
print e.reason
lParser.feed(content)
lParser.close();
print "############ urls parsed

for chapterUrl in urlSeeds:
SaveText(chapterUrl)
fp.close()