From c8c9b22128eb9154a14ad1177d9cfb7bd83fc320 Mon Sep 17 00:00:00 2001
From: Sass Bálint
Date: Sat, 11 Feb 2023 05:17:59 +0100
Subject: [PATCH 1/7] [add] example: download comments for a post (#965)

* [add] example: download comments for a post

* [fix] undo unintentionally deleted part
---
 README.md | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/README.md b/README.md
index 4088e754..0e31a3ce 100644
--- a/README.md
+++ b/README.md
@@ -75,6 +75,45 @@ Run `facebook-scraper --help` for more details on CLI usage.

**Note:** If you get a `UnicodeEncodeError` try adding `--encoding utf-8`.

+### Practical example: download comments for a post

```python
"""
Download comments for a public Facebook post.
"""

import facebook_scraper as fs

# get POST_ID from the URL of the post, which can have one of the following structures:
# https://www.facebook.com/USER/posts/POST_ID
# https://www.facebook.com/groups/GROUP_ID/posts/POST_ID
POST_ID = "pfbid02NsuAiBU9o1ouwBrw1vYAQ7khcVXvz8F8zMvkVat9UJ6uiwdgojgddQRLpXcVBqYbl"

# number of comments to download -- set this to True to download all comments
MAX_COMMENTS = 100

# get the post (this returns a generator)
gen = fs.get_posts(
    post_urls=[POST_ID],
    options={"comments": MAX_COMMENTS, "progress": True}
)

# take the first element of the generator, which is the post we requested
post = next(gen)

# extract the comments part
comments = post['comments_full']

# process the comments as you want...
for comment in comments:

    # e.g. ...print them
    print(comment)

    # e.g. ...get the replies to them
    for reply in comment['replies']:
        print(' ', reply)
```

## Post example

From 30a1f958cab850de6c2b06040523b4ddec26422c Mon Sep 17 00:00:00 2001
From: Girvin Junod <68438200+girvinjunod@users.noreply.github.com>
Date: Fri, 17 Feb 2023 12:28:49 +0700
Subject: [PATCH 2/7] fix regex for searching next page in hashtag scraping (#972)

---
 facebook_scraper/page_iterators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/facebook_scraper/page_iterators.py b/facebook_scraper/page_iterators.py
index 5989ef80..cbe0b595 100644
--- a/facebook_scraper/page_iterators.py
+++ b/facebook_scraper/page_iterators.py
@@ -286,7 +286,7 @@ def get_next_page(self) -> Optional[URL]:

class HashtagPageParser(PageParser):
-    cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?locale=[a-z_A-Z]+&cursor=[^"]+).*$')
+    cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?cursor=[^"]+).*$')

    def get_page(self) -> Page:
        return super()._get_page('article', 'article')

From 43cb0e97ae49d03fcb7d206544907ae70f205921 Mon Sep 17 00:00:00 2001
From: R44CX <29802889+r44cx@users.noreply.github.com>
Date: Mon, 20 Mar 2023 10:32:58 +0100
Subject: [PATCH 3/7] Replace chrome cookie export extension

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0e31a3ce..9d8c3217 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ We’re headed to PAX East 3/28-3/31 with new games
- **youtube_dl**: bool, use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed in your environment. Default is False.
- **post_urls**: list, URLs or post IDs to extract posts from. Alternative to fetching based on username.
- **cookies**: One of:
-    - The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like [Get Cookies.txt (Chrome)](https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid?hl=en) or [Cookie Quick Manager (Firefox)](https://addons.mozilla.org/en-US/firefox/addon/cookie-quick-manager/). Make sure that you include both the c_user cookie and the xs cookie; you will get an InvalidCookies exception if you don't.
+    - The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) or [Cookie Quick Manager (Firefox)](https://addons.mozilla.org/en-US/firefox/addon/cookie-quick-manager/). Make sure that you include both the c_user cookie and the xs cookie; you will get an InvalidCookies exception if you don't.
    - A [CookieJar](https://docs.python.org/3.9/library/http.cookiejar.html#http.cookiejar.CookieJar)
    - A dictionary that can be converted to a CookieJar with [cookiejar_from_dict](https://2.python-requests.org/en/master/api/#requests.cookies.cookiejar_from_dict)
    - The string `"from_browser"` to try to extract Facebook cookies from your browser

From d0c74caa74f5c3801260157f70ae788a15e7487e Mon Sep 17 00:00:00 2001
From: Pierre
Date: Mon, 30 Oct 2023 05:47:44 +0100
Subject: [PATCH 4/7] Add default for matching (#991)

---
 facebook_scraper/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/facebook_scraper/__init__.py b/facebook_scraper/__init__.py
index c9eecdff..e030bb9b 100644
--- a/facebook_scraper/__init__.py
+++ b/facebook_scraper/__init__.py
@@ -466,7 +466,7 @@ def handle_pagination_url(url):
                 output_file.write(",")
             match = None
             if post["text"]:
-                match = re.search(kwargs.get("matching"), post["text"], flags=re.IGNORECASE)
+                match = re.search(kwargs.get("matching", '.+'), post["text"], flags=re.IGNORECASE)
             if kwargs.get("not_matching") and re.search(
                 kwargs.get("not_matching"), post["text"], flags=re.IGNORECASE
             ):

From 5f101faa4739bc81054479d4a9be56f9b8c403e6 Mon Sep 17 00:00:00 2001
From: Pierre
Date: Mon, 30 Oct 2023 05:52:07 +0100
Subject: [PATCH 5/7] Add example to use write_posts_to_csv (#992)

---
 README.md | 48 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 9d8c3217..4a7c7c47 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,8 @@
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

-
Scrape Facebook public pages without an API key. Inspired by [twitter-scraper](https://github.com/kennethreitz/twitter-scraper).

-
## Install

To install the latest release from PyPI:

@@ -42,7 +40,6 @@ The final step on the road to the Super Smash Bros

We’re headed to PAX East 3/28-3/31 with new games
```

-
### Optional parameters

@@ -158,26 +155,27 @@ for comment in comments:
 'w3_fb_url': 'https://www.facebook.com/Nintendo/posts/2257188721032235'}

-
### Notes

- There is no guarantee that every field will be extracted (they might be `None`).
- Group posts may be missing some fields like `time` and `post_url`.
- Group scraping may return only one page and not work on private groups.
- If you scrape too much, Facebook might temporarily ban your IP.
-- The vast majority of unique IDs on facebook (post IDs, video IDs, photo IDs, comment IDs, profile IDs, etc) can be appended to https://www.facebook.com/ to result in a redirect to the corresponding object. +- The vast majority of unique IDs on facebook (post IDs, video IDs, photo IDs, comment IDs, profile IDs, etc) can be appended to "https://www.facebook.com/" to result in a redirect to the corresponding object. - Some functions (such as extracting reactions) require you to be logged into Facebook (pass cookies). If something isn't working as expected, try pass cookies and see if that fixes it. ## Profiles -The `get_profile` function can extract information from a profile's about section. Pass in the account name or ID as the first parameter. +The `get_profile` function can extract information from a profile's about section. Pass in the account name or ID as the first parameter. Note that Facebook serves different information depending on whether you're logged in (cookies parameter), such as Date of birth and Gender. Usage: ```python from facebook_scraper import get_profile get_profile("zuck") # Or get_profile("zuck", cookies="cookies.txt") ``` + Outputs: + ```python {'About': "I'm trying to make the world a more open place.", 'Education': 'Harvard University\n' @@ -218,7 +216,7 @@ To extract friends, pass the argument `friends=True`, or to limit the amount of ## Group info -The `get_group_info` function can extract info about a group. Pass in the group name or ID as the first parameter. +The `get_group_info` function can extract info about a group. Pass in the group name or ID as the first parameter. Note that in order to see the list of admins, you need to be logged in (cookies parameter). Usage: @@ -243,6 +241,39 @@ Output: 'type': 'Public group'} ``` +## Write to a CSV file directly + +The library also provides a `write_posts_to_csv()` function that writes posts directly to the disk and is able to resume scraping from the address of the last page. It is very useful when scraping large pages as the data is saved continuously and scraping can be resumed in case of an error. Here is an example to fetch the posts of a group 100 pages at a time and save them in separate files. + +```python +import facebook_scraper as fs + +# Saves the first 100 pages +for i in range(1, 101): + fs.write_posts_to_csv( + group=GROUP_ID, # The method uses get_posts internally so you can use the same arguments and they will be passed along + page_limit=100, + timeout=60, + options={ + 'allow_extra_requests': False + }, + filename=f'./data/messages_{i}.csv', # Will throw an error if the file already exists + resume_file='next_page.txt', # Will save a link to the next page in this file after fetching it and use it when starting. 
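        # assumption worth noting: since the resume file stores the link to the
        # next page to fetch, deleting it should restart the scrape from page one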
        matching='.+', # A regex can be used to filter all the posts matching a certain pattern (here, we accept anything)
        not_matching='^Warning', # And likewise those that don't fit a pattern (here, we filter out all posts starting with "Warning")
        keys=[
            'post_id',
            'text',
            'timestamp',
            'time',
            'user_id'
        ], # List of the keys that should be saved for each post, will save all keys if not set
        format='csv', # Output file format, can be csv or json, defaults to csv
        days_limit=3650 # Number of days for the oldest post to fetch, defaults to 3650
    )
```

## To-Do

- Async support
- ~~Image galleries~~ (`images` entry)
- ~~Profiles or post authors~~ (`get_profile()`)
- ~~Comments~~ (with `options={'comments': True}`)

## Alternatives and related projects

- [facebook-post-scraper](https://github.com/brutalsavage/facebook-post-scraper). Has comments. Uses Selenium.
- [facebook-scraper-selenium](https://github.com/apurvmishra99/facebook-scraper-selenium). "Scrape posts from any group or user into a .csv file without needing to register for any API access".
- [Ultimate Facebook Scraper](https://github.com/harismuneer/Ultimate-Facebook-Scraper). "Scrapes almost everything about a Facebook user's profile". Uses Selenium.
- [Unofficial APIs](https://github.com/Rolstenhouse/unofficial-apis). List of unofficial APIs for various services, none for Facebook for now, but might be worth checking in the future.
- [major-scrapy-spiders](https://github.com/talhashraf/major-scrapy-spiders). Has a profile spider for Scrapy.
- [facebook-page-post-scraper](https://github.com/minimaxir/facebook-page-post-scraper). Seems abandoned.
-  - [FBLYZE](https://github.com/isaacmg/fb_scraper). Fork (?).
+- [FBLYZE](https://github.com/isaacmg/fb_scraper). Fork (?).
- [RSSHub](https://github.com/DIYgod/RSSHub/blob/master/lib/routes/facebook/page.js). Generates an RSS feed from Facebook pages.
- [RSS-Bridge](https://github.com/RSS-Bridge/rss-bridge/blob/master/bridges/FacebookBridge.php). Also generates RSS feeds from Facebook pages.

From 567711fbab3e014504a1d4f33f882c2b29d71584 Mon Sep 17 00:00:00 2001
From: John Liu
Date: Mon, 30 Oct 2023 12:53:20 +0800
Subject: [PATCH 6/7] Doc: add comment and reply data model to README.md (#1032)

---
 README.md | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/README.md b/README.md
index 4a7c7c47..ebc42b70 100644
--- a/README.md
+++ b/README.md
@@ -163,6 +163,41 @@ for comment in comments:
- If you scrape too much, Facebook might temporarily ban your IP.
- The vast majority of unique IDs on facebook (post IDs, video IDs, photo IDs, comment IDs, profile IDs, etc) can be appended to "https://www.facebook.com/" to result in a redirect to the corresponding object.
- Some functions (such as extracting reactions) require you to be logged into Facebook (pass cookies). If something isn't working as expected, try passing cookies and see if that fixes it.
- Reaction Categories (EN): [`like`, `love`, `haha`, `sorry`, `wow`, `angry`, `care`]

## Comment & Reply example

```python
{'comment_id': '1417925635669547',
 'comment_url': 'https://facebook.com/1417925635669547',
 'commenter_id': '100009665948953',
 'commenter_url': 'https://facebook.com/tw0311?eav=AfZuEAOAat6KRX5WFplL0SNA4ZW78Z2O7W_sjwMApq67hZxXDwXh2WF2ezhICX1LCT4&fref=nf&rc=p&refid=52&__tn__=R&paipv=0',
 'commenter_name': 'someone',
 'commenter_meta': None,
 'comment_text': 'something',
 'comment_time': datetime.datetime(2023, 6, 23, 0, 0),
 'comment_image': 'https://scontent.ftpe8-2.fna.fbcdn.net/m1/v/t6/An_UvxJXg9tdnLU3Y5qjPi0200MLilhzPXUgxzGjQzUMaNcmjdZA6anyrngvkdub33NZzZhd51fpCAEzNHFhko5aKRFP5fS1w_lKwYrzcNLupv27.png?ccb=10-5&oh=00_AfCdlpCwAg-SHhniMQ16uElFHh-OG8kGGmLAzvOY5_WZgw&oe=64BE3279&_nc_sid=7da55a',
 'comment_reactors': [
    {'name': 'Tom', 'link': 'https://facebook.com/ryan.dwayne?eav=AfaxdKIITTXyZj4H-eanXQgoxzOa8Vag6XkGXXDisGzh_W74RYZSXxlFZBofR4jUIOg&fref=pb&paipv=0', 'type': 'like'},
    {'name': 'Macy', 'link': 'https://facebook.com/profile.php?id=100000112053053&eav=AfZ5iWlNN-EjjSwVNQl7E2HiVp25AUZMqfoPvLRZGnbUAQxuLeN8nl6xnnQTJB3uxDM&fref=pb&paipv=0', 'type': 'like'}],
 'comment_reactions': {'like': 2},
 'comment_reaction_count': 2,
 'replies': [
    {'comment_id': '793761608817229',
     'comment_url': 'https://facebook.com/793761608817229',
     'commenter_id': '100022377272712',
     'commenter_url': 'https://facebook.com/brizanne.torres?eav=Afab9uP4ByIMn1xaYK0UDd1SRU8e5Zu7faKEx6qTzLKD2vp_bB1xLDGvTwEd6u8A7jY&fref=nf&rc=p&__tn__=R&paipv=0',
     'commenter_name': 'David',
     'commenter_meta': None,
     'comment_text': 'something',
     'comment_time': datetime.datetime(2023, 6, 23, 18, 0),
     'comment_image': None,
     'comment_reactors': [],
     'comment_reactions': {'love': 2},
     'comment_reaction_count': None}
    ]
}
```

## Profiles

From fb8d7b5158e4c8347a1832f4fac6c6ffc793e36b Mon Sep 17 00:00:00 2001
From: Young8881 <169129609+Young8881@users.noreply.github.com>
Date: Mon, 17 Jun 2024 06:08:51 -0400
Subject: [PATCH 7/7] Create python-package-conda.yml

Signed-off-by: Young8881 <169129609+Young8881@users.noreply.github.com>
---
 .github/workflows/python-package-conda.yml | 34 ++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 .github/workflows/python-package-conda.yml

diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
new file mode 100644
index 00000000..f3586044
--- /dev/null
+++ b/.github/workflows/python-package-conda.yml
@@ -0,0 +1,34 @@
name: Python Package using Conda

on: [push]

jobs:
  build-linux:
    runs-on: ubuntu-latest
    strategy:
      max-parallel: 5

    steps:
    - uses: actions/checkout@v4
    - name: Set up Python 3.10
      uses: actions/setup-python@v3
      with:
        python-version: '3.10'
    - name: Add conda to system path
      run: |
        # $CONDA is an environment variable pointing to the root of the miniconda directory
        echo $CONDA/bin >> $GITHUB_PATH
    - name: Install dependencies
      run: |
        conda env update --file environment.yml --name base
    - name: Lint with flake8
      run: |
        conda install flake8
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        conda install pytest
        pytest
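The `Install dependencies` step above runs `conda env update --file environment.yml --name base`, which assumes an `environment.yml` at the repository root; this patch does not add one. A minimal sketch of what such a file could look like is below — the dependency list is an assumption for illustration, not a file taken from the repository:

```yaml
# Hypothetical environment.yml for the workflow above.
# The dependency names are assumptions; installing the package itself via pip
# pulls in its own requirements automatically.
name: base
channels:
  - conda-forge
dependencies:
  - python=3.10
  - pip
  - pip:
      - facebook-scraper
```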