#!/usr/bin/env python

import cStringIO
import formatter
from htmllib import HTMLParser  # We use various classes in these modules for parsing HTML.
import httplib                  # We only need an exception from this module.
import os                       # This provides various file system functions.
import sys                      # We are just using argv for command-line arguments.
import urllib                   # We only need the urlretrieve() function for downloading Web pages.
import urlparse                 # We use the urlparse() and urljoin() functions for URL manipulation.
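# A quick interactive sketch of the two URL helpers used throughout
# (the baidu URL matches the example values traced in the comments below):
#
#   >>> import urlparse
#   >>> urlparse.urlparse('http://www.baidu.com')
#   ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
#   >>> urlparse.urljoin('http://www.baidu.com/a/', 'b.html')
#   'http://www.baidu.com/a/b.html'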
class Retriever(object):
    __slots__ = ('url', 'file')

    def __init__(self, url):
        self.url, self.file = self.get_file(url)
    def get_file(self, url, default='index.html'):
        'Create usable local filename from URL'
        parsed = urlparse.urlparse(url)
        # ParseResult(scheme='http', netloc='www.baidu.com', path='', params='', query='', fragment='')
        host = parsed.netloc.split('@')[-1].split(':')[0]  # 'www.baidu.com'
        filepath = '%s%s' % (host, parsed.path)            # 'www.baidu.com'
        if not os.path.splitext(parsed.path)[1]:           # no extension: ''
            filepath = os.path.join(filepath, default)     # 'www.baidu.com\index.html'
        linkdir = os.path.dirname(filepath)                # 'www.baidu.com'
        if not os.path.isdir(linkdir):                     # os.path.isdir() -> False
            if os.path.exists(linkdir):                    # False
                os.unlink(linkdir)                         # remove any plain file squatting on the name
            os.makedirs(linkdir)                           # create the link directory on disk
        return url, filepath
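    # A hypothetical trace of get_file() (os.path.join uses '\' on Windows,
    # which is what the inline comments above show; on POSIX it is '/'):
    #
    #   >>> Retriever('http://www.baidu.com').file
    #   'www.baidu.com/index.html'       # no extension -> default appended
    #   >>> Retriever('http://www.baidu.com/img/logo.png').file
    #   'www.baidu.com/img/logo.png'     # extension present -> kept as-is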
    def download(self):
        'Download URL to specifically named file'
        try:
            retval = urllib.urlretrieve(self.url, self.file)
        except (IOError, httplib.InvalidURL) as e:
            retval = (('*** ERROR: bad URL "%s": %s' % (self.url, e)),)
        return retval
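    # On success urllib.urlretrieve() returns a (filename, headers) tuple,
    # so download()[0] is the local path; on failure we build a 1-tuple
    # whose only element starts with '***', which get_page() checks for.
    # A sketch of both shapes (hypothetical hosts):
    #
    #   >>> Retriever('http://www.baidu.com').download()[0]
    #   'www.baidu.com/index.html'
    #   >>> Retriever('http://no.such.host').download()[0]
    #   '*** ERROR: bad URL "http://no.such.host": ...'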
    def parse_links(self):
        'Parse out the links found in downloaded HTML file'
        f = open(self.file, 'r')
        data = f.read()
        f.close()
        parser = HTMLParser(formatter.AbstractFormatter(
            formatter.DumbWriter(cStringIO.StringIO())))
        parser.feed(data)
        parser.close()
        return parser.anchorlist
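    # htmllib.HTMLParser requires a formatter, so the AbstractFormatter/
    # DumbWriter/StringIO plumbing above exists only to satisfy that
    # interface; its output is discarded. The parser's .anchorlist
    # attribute accumulates the href of every <a> tag it sees.
    # Since htmllib and formatter are deprecated, an equivalent sketch
    # with the HTMLParser module (not part of the original script):
    #
    #   from HTMLParser import HTMLParser as BaseParser
    #
    #   class AnchorLister(BaseParser):
    #       def __init__(self):
    #           BaseParser.__init__(self)
    #           self.anchorlist = []
    #       def handle_starttag(self, tag, attrs):
    #           if tag == 'a':
    #               href = dict(attrs).get('href')
    #               if href:
    #                   self.anchorlist.append(href)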
class Crawler(object):
    count = 0  # the number of pages downloaded from the Internet

    def __init__(self, url):
        self.q = [url]      # a queue of links to download
        self.seen = set()   # all the links we have already seen (downloaded)
        parsed = urlparse.urlparse(url)
        host = parsed.netloc.split('@')[-1].split(':')[0]
        self.dom = '.'.join(host.split('.')[-2:])  # 'baidu.com'
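    # A quick trace of the domain computation:
    #
    #   >>> host = 'www.baidu.com'
    #   >>> '.'.join(host.split('.')[-2:])
    #   'baidu.com'
    #
    # Note that get_page() later tests `self.dom not in link`, a plain
    # substring check, so a link such as 'http://evil.example/baidu.com'
    # would also pass; treat it as a rough same-domain filter.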
    def get_page(self, url, media=False):
        'Download page & parse links, add to queue if nec'
        r = Retriever(url)
        fname = r.download()[0]       # 'www.baidu.com\index.html'
        if fname[0] == '*':           # first char is 'w' here, so no error
            print fname, '… skipping parse'
            return
        Crawler.count += 1            # 1
        print '\n(', Crawler.count, ')'   # prints: ( 1 )
        print 'URL:', url             # URL: http://www.baidu.com
        print 'FILE:', fname          # FILE: www.baidu.com\index.html
        self.seen.add(url)            # set(['http://www.baidu.com'])
        ftype = os.path.splitext(fname)[1]   # '.html'
        if ftype not in ('.htm', '.html'):   # False, so keep going
            return
        for link in r.parse_links():
            if link.startswith('mailto:'):   # False
                print '… discarded, mailto link'
                continue
            if not media:                    # media defaults to False
                ftype = os.path.splitext(link)[1]
                if ftype in ('.mp3', '.mp4', '.m4v', '.wav'):
                    print '… discarded, media file'
                    continue
            if not link.startswith('http://'):
                link = urlparse.urljoin(url, link)
            print '*', link,                 # trailing comma: verdict prints on the same line
            if link not in self.seen:        # True
                if self.dom not in link:     # False
                    print '… discarded, not in domain'
                else:
                    if link not in self.q:
                        self.q.append(link)
                        print '… new, added to Q'
                    else:
                        print '… discarded, already in Q'
            else:
                print '… discarded, already processed'
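        # Membership tests against self.seen and self.q compare exact URL
        # strings, so 'http://www.baidu.com' and 'http://www.baidu.com/'
        # count as different pages. A light canonicalization step (a
        # sketch, not in the original) would trim fragments first:
        #
        #   link = urlparse.urldefrag(link)[0]   # drop any '#fragment'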
    def go(self, media=False):
        'Process next page in queue (if any)'
        while self.q:
            url = self.q.pop()
            self.get_page(url, media)
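    # list.pop() removes from the *end* of the queue, so the crawl is
    # depth-first; a breadth-first variant (a sketch, not the original
    # behavior) would pop from the front instead:
    #
    #   def go(self, media=False):
    #       while self.q:
    #           url = self.q.pop(0)   # FIFO ordering
    #           self.get_page(url, media)
    #
    # (collections.deque with popleft() avoids pop(0)'s O(n) cost.)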
def main():
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    if not url.startswith('http://') and not url.startswith('ftp://'):
        url = 'http://%s/' % url
    robot = Crawler(url)
    robot.go()
if __name__ == '__main__':
    main()
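# Typical invocation (hypothetical run, matching the example traced in
# the comments above):
#
#   $ python crawl.py http://www.baidu.com
#
#   ( 1 )
#   URL: http://www.baidu.com
#   FILE: www.baidu.com/index.html
#   ...
#
# Downloaded pages land in a directory tree named after each host.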