# advanced_link_crawler_using_requests.py
# A link crawler built on the `requests` library: downloads pages with
# retry/backoff on 5xx errors, honors robots.txt, throttles per-domain
# requests, and limits crawl depth to avoid spider traps.
import re
from urllib import robotparser
from urllib.parse import urljoin
import requests
from chp1.throttle import Throttle
def download(url, num_retries=2, user_agent='wswp', proxies=None):
    """ Download a given URL and return the page content
        args:
            url (str): URL
        kwargs:
            user_agent (str): user agent (default: wswp)
            proxies (dict): proxy dict w/ keys 'http' and 'https', values
                            are strs (i.e. 'http(s)://IP') (default: None)
            num_retries (int): # of retries if a 5xx error is seen (default: 2)
        returns:
            str or None: the page HTML, or None on a 4xx/5xx response or a
            connection-level failure
    """
    print('Downloading:', url)
    headers = {'User-Agent': user_agent}
    try:
        resp = requests.get(url, headers=headers, proxies=proxies)
        html = resp.text
        if resp.status_code >= 400:
            print('Download error:', resp.text)
            html = None
            if num_retries and 500 <= resp.status_code < 600:
                # recursively retry 5xx HTTP errors
                # (bug fix: forward user_agent and proxies so retries don't
                # silently fall back to the defaults)
                return download(url, num_retries - 1,
                                user_agent=user_agent, proxies=proxies)
    except requests.exceptions.RequestException as e:
        # connection errors, timeouts, invalid URLs, etc.
        print('Download error:', e)
        html = None
    return html
def get_robots_parser(robots_url):
    """Fetch robots.txt from *robots_url* and return a ready-to-query
    RobotFileParser instance."""
    parser = robotparser.RobotFileParser()
    parser.set_url(robots_url)
    # read() fetches and parses the robots.txt file over the network
    parser.read()
    return parser
def get_links(html):
    """Return every href target of an <a> tag found in *html*.

    Matching is done with a simple case-insensitive regex (not a real
    HTML parser), and results are returned in document order.
    """
    # findall returns only the capture group: the quoted href value
    return re.findall("""<a[^>]+href=["'](.*?)["']""", html, re.IGNORECASE)
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=3, max_depth=4):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.
        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                              (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (dict): proxy dict w/ keys 'http' and 'https', values
                            are strs (i.e. 'http(s)://IP') (default: None)
            delay (int): seconds to throttle between requests
                         to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
    """
    crawl_queue = [start_url]
    # keep track which URL's have seen before, mapping URL -> crawl depth
    seen = {}
    if not robots_url:
        # bug fix: urljoin resolves '/robots.txt' against the site root even
        # when start_url carries a path or trailing slash ('{}/robots.txt'
        # formatting produced e.g. 'http://x.com//robots.txt')
        robots_url = urljoin(start_url, '/robots.txt')
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            # rate-limit requests to the same domain
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)