Skip to content

Script stops at "Scraping FB Posts..." with HTTPError #1

@vrl2

Description

@vrl2

Thanks for putting this together! I'm trying to run this code, but I can only get as far as "Scraping FB Posts from Business Times." When I run that section, it prints 11 links, but the last link leads to a missing page on Business Times. This causes the script to abort with an HTTP error (output below). Any guidance is much appreciated!

```
...
14 Mar 2019 https://www.businesstimes.com.sg/technology/massive-outage-adds-to-growing-facebook-woes
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-faces-criminal-probe-of-data-deals-report
14 Mar 2019 https://www.businesstimes.com.sg/technology/facebook-instagram-outage-spreads-to-users-around-the-globe

HTTPError Traceback (most recent call last)
in
18 try:
---> 19 link_page = urlopen(url).read()
20 except:

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651

HTTPError: HTTP Error 403: Forbidden

During handling of the above exception, another exception occurred:

HTTPError Traceback (most recent call last)
in
20 except:
21 url = url[:-2]
---> 22 link_page = urlopen(url).read()
23 link_soup = BeautifulSoup(link_page)
24 sentences = link_soup.findAll("p")

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
221 else:
222 opener = _opener
--> 223 return opener.open(url, data, timeout)
224
225 def install_opener(opener):

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
530 for processor in self.process_response.get(protocol, []):
531 meth = getattr(processor, meth_name)
--> 532 response = meth(req, response)
533
534 return response

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_response(self, request, response)
640 if not (200 <= code < 300):
641 response = self.parent.error(
--> 642 'http', request, response, code, msg, hdrs)
643
644 return response

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in error(self, proto, *args)
568 if http_err:
569 args = (dict, 'default', 'http_error_default') + orig_args
--> 570 return self._call_chain(*args)
571
572 # XXX probably also want an abstract factory that knows when it makes

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
502 for handler in handlers:
503 func = getattr(handler, meth_name)
--> 504 result = func(*args)
505 if result is not None:
506 return result

/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
648 class HTTPDefaultErrorHandler(BaseHandler):
649 def http_error_default(self, req, fp, code, msg, hdrs):
--> 650 raise HTTPError(req.full_url, code, msg, hdrs, fp)
651
652 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found
```

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions