Python Crawler Programming: 100 Examples

This example implements a simple single-domain Web crawler in Python 2: a Retriever class downloads one page and parses out its links, and a Crawler class manages the queue of URLs left to visit. The inline comments trace a run whose start URL is http://www.baidu.com.

#!/usr/bin/env python

import cStringIO    # we use classes from these three modules to parse HTML
import formatter
from htmllib import HTMLParser
import httplib      # we only need an exception from this module
import os           # this provides various file system functions
import sys          # we are just using argv for command-line arguments
import urllib       # we only need the urlretrieve() function for downloading Web pages
import urlparse     # we use the urlparse() and urljoin() functions for URL manipulation
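Note that the listing is Python 2 throughout: cStringIO, httplib, and urlparse were renamed in Python 3, and htmllib/formatter were removed entirely. For readers on Python 3, a rough module mapping (a sketch, not part of the original listing) is:

import io                           # replaces cStringIO (io.StringIO)
import http.client                  # replaces httplib (http.client.InvalidURL)
import urllib.request               # urllib.request.urlretrieve() replaces urllib.urlretrieve()
import urllib.parse                 # urlparse() and urljoin() now live here
from html.parser import HTMLParser  # htmllib and formatter are gone; see the sketch after parse_links()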

class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)

    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)     # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]   # 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)             # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:            # path has no file extension
            filepath = os.path.join(filepath, default)      # 'www.baidu.com\index.html'
        linkdir = os.path.dirname(filepath)                 # 'www.baidu.com'
        if not os.path.isdir(linkdir):
            if os.path.exists(linkdir):     # a plain file is squatting on the directory name
                os.unlink(linkdir)
            os.makedirs(linkdir)            # create the local directory that will hold the page
        return url, filepath
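To see what get_file() produces, here is a hypothetical interactive session (the URLs are examples only; constructing a Retriever also creates the local directory as a side effect, and on Windows the separators would be backslashes, as in the trace comments above):

>>> Retriever('http://www.baidu.com/docs/intro').file   # path has no file extension
'www.baidu.com/docs/intro/index.html'
>>> Retriever('http://www.baidu.com/a.html').file       # path already names a file
'www.baidu.com/a.html'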

    def download(self):
        'Download URL to specifically named file'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval

    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist
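On Python 3, htmllib, formatter, and the anchorlist attribute no longer exist. A minimal replacement sketch (the class name AnchorParser is ours, not standard library):

from html.parser import HTMLParser

class AnchorParser(HTMLParser):
    'Collect href values of <a> tags, mimicking the old parser.anchorlist'
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchorlist = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchorlist.append(value)

parser = AnchorParser()
parser.feed('<a href="http://www.baidu.com">home</a>')
print(parser.anchorlist)    # ['http://www.baidu.com']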

class Crawler(object):
    count = 0   # the number of objects downloaded from the internet

    def __init__(self, url):
        self.q = [url]      # a queue of links to download
        self.seen = set()   # all the links that we have seen (downloaded) already
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])   # 'baidu.com'
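The last assignment keeps only the final two labels of the hostname, so for the sample start URL:

>>> host = 'www.baidu.com'
>>> '.'.join(host.split('.')[-2:])
'baidu.com'

Since get_page() later tests membership with a plain substring check (self.dom not in link), any URL that merely contains 'baidu.com' anywhere would pass; that looseness is worth knowing about before reusing the class.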

    def get_page(self, url, media=False):
        'Download page & parse links, add to queue if nec'
        r = Retriever(url)
        fname = r.download()[0]                 # 'www.baidu.com\index.html'
        if fname[0] == '*':                     # error messages start with '*'
            print fname, '... skipping parse'
            return
        Crawler.count += 1                      # 1
        print '\n(', Crawler.count, ')'         # ( 1 )
        print 'URL:', url                       # URL: http://www.baidu.com
        print 'FILE:', fname                    # FILE: www.baidu.com\index.html
        self.seen.add(url)                      # set(['http://www.baidu.com'])
        ftype = os.path.splitext(fname)[1]      # '.html'
        if ftype not in ('.htm', '.html'):      # only HTML files are parsed for links
            return

        for link in r.parse_links():
            if link.startswith('mailto:'):
                print '... discarded, mailto link'
                continue
            if not media:
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '... discarded, media file'
                    continue
            if not link.startswith('http://'):
                link = urlparse.urljoin(url, link)  # make relative links absolute
            print '*', link,
            if link not in self.seen:
                if self.dom not in link:
                    print '... discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)
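One design detail: self.q.pop() with no argument removes the most recently added link, so the crawl is effectively depth-first. If breadth-first order is wanted instead, a one-line change (our suggestion, not part of the original) would do it:

            url = self.q.pop(0)     # FIFO: oldest link first, i.e. breadth-first

For large crawls, a collections.deque with popleft() avoids the O(n) cost of pop(0).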

def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
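The script takes its seed URL from the command line, or prompts for one; URLs without a scheme get 'http://' prepended. A minimal usage sketch (crawl.py is a hypothetical filename for this listing):

# From a shell (requires a Python 2 interpreter):
#   $ python crawl.py http://www.baidu.com
# Or programmatically, mirroring main():
robot = Crawler('http://www.baidu.com')     # seed URL from the trace comments above
robot.go()                                  # fetches pages until the queue is empty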
