-
Notifications
You must be signed in to change notification settings - Fork 91
Description
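Thanks for putting this together! I'm attempting to run this code, but I can only get as far as the "Scraping FB Posts from Business Times" section. When I run that section, it prints 11 links, but the last one leads to a missing page on Business Times. The first request fails with HTTP 403 (Forbidden); the fallback in the `except` branch, which retries with the last two characters trimmed from the URL, then fails with HTTP 404 (Not Found), so the script stops with an unhandled HTTPError (full output below). Any guidance is much appreciated!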
```
...
14 Mar 2019 https://www.businesstimes.com.sg/technology/massive-outage-adds-to-growing-facebook-woes
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-faces-criminal-probe-of-data-deals-report
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-instagram-outage-spreads-to-users-around-the-globe
HTTPError Traceback (most recent call last)
in
18 try:
---> 19 link_page = urlopen(url).read()
20 except:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
HTTPError: HTTP Error 403: Forbidden
During handling of the above exception, another exception occurred:
HTTPError Traceback (most recent call last)
in
20 except:
21 url = url[:-2]
---> 22 link_page = urlopen(url).read()
23 link_soup = BeautifulSoup(link_page)
24 sentences = link_soup.findAll("p")
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 404: Not Found
```