From e1172ff8cdfec6207611e4d5ec9e2e8b98cd17fc Mon Sep 17 00:00:00 2001
From: Dascienz
Date: Wed, 5 Feb 2020 21:12:01 -0500
Subject: [PATCH] Refactoring and README updates

---
 .gitignore                                   |  1 +
 README.md                                    | 48 ++++++++++----------
 phpBB_scraper/phpBB_scraper/spiders/phpBB.py | 38 +++++++++-------
 requirements.txt                             |  2 +
 4 files changed, 48 insertions(+), 41 deletions(-)
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 2970cae..5033f1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.xls
 *.xlsx
 *.pyc
+!requirements.txt
 
 # Exclude Directories
 __pycache__/
diff --git a/README.md b/README.md
index 8fe4854..0b173e9 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,40 @@
 # phpBB Forum Scraper
-Python-based scraper for phpBB forums.
-Code requires:
+Python-based web scraper for phpBB forums. The project can be used as a template for building your own
+custom Scrapy spiders or for one-off crawls of designated forums. Keep in mind that aggressive crawls
+can put significant strain on web servers, so please throttle your request rates.
-1. Python scraping library, [Scrapy](http://scrapy.org/).
-
-2. Python HTML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
+
+## Requirements:
+
+1. Python web scraping library, [Scrapy](http://scrapy.org/).
+2. Python HTML/XML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
 
 ## Scraper Output
-Scrapes the following information from forum posts:
- 1. Username
+The `phpBB.py` spider scrapes the following information from forum posts:
+1. Username
+2. User Post Count
+3. Post Date & Time
+4. Post Text
+5. Quoted Text
- 2. User post count
- 
- 3. Post date & time
- 
- 4. Post text
- 
- 5. Quoted text
+If you need additional data scraped, you will have to edit the existing spider or create additional ones.
 
-Edit `phpBB.py` and specify:
+## Edit `phpBB.py` and Specify:
 1. `allowed_domains`
-
-2. `start_urls`
-
+2. `start_urls`
 3. `username` & `password`
-
 4. `form_login=False` or `form_login=True`
 
-## Instructions:
-From within `/phpBB_scraper/`:
-
-`scrapy crawl phpBB` to launch the crawler.
-
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
+## Running the Scraper:
+```bash
+cd phpBB_scraper/
+scrapy crawl phpBB
+# scrapy crawl phpBB -o posts.csv
+```
 
 NOTE: Please adjust `settings.py` to throttle your requests.
\ No newline at end of file
diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
index cdb713e..d398af5 100644
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -4,26 +4,30 @@
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
+
 class PhpbbSpider(scrapy.Spider):
     name = 'phpBB'
-    #Domain only, no urls
+    # Domain only, no URLs
     allowed_domains = ['']
     start_urls = ['']
     username = ''
     password = ''
-    # False if you dont need to login, true if you do.
+    # False if you don't need to log in, True if you do.
     form_login = False
 
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
-            formdata = {'username': self.username,
-                        'password': self.password}
-            form_request = [scrapy.FormRequest.from_response(response,
-                                                             formdata=formdata,
-                                                             callback=self.after_login,
-                                                             dont_click=True)]
+            formdata = {'username': self.username, 'password': self.password}
+            form_request = [
+                scrapy.FormRequest.from_response(
+                    response,
+                    formdata=formdata,
+                    callback=self.after_login,
+                    dont_click=True
+                )
+            ]
             yield form_request
             return
         else:
@@ -59,7 +63,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         block_quotes = soup.find_all('blockquote')
         for i, quote in enumerate(block_quotes):
-            block_quotes[i] = '%s='%str(i+1) + quote.get_text()
+            block_quotes[i] = '%s=%s' % (str(i + 1), quote.get_text())
         return ''.join(block_quotes)
 
     def clean_text(self, string):
@@ -68,7 +72,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         for tag in tags:
             for i, item in enumerate(soup.find_all(tag)):
-                item.replaceWith('%s='%str(i+1))
+                item.replaceWith('%s=' % str(i + 1))
         return re.sub(r' +', r' ', soup.get_text())
 
     def parse_posts(self, response):
@@ -80,13 +84,15 @@
         post_quotes = [self.clean_quote(s) for s in post_texts]
         post_texts = [self.clean_text(s) for s in post_texts]
 
-        #YIELD POST DATA
+        # YIELD POST DATA
         for i in range(len(usernames)):
-            yield {'Username': usernames[i],
-                   'PostCount': post_counts[i],
-                   'PostTime': post_times[i],
-                   'PostText': post_texts[i],
-                   'QuoteText': post_quotes[i]}
+            yield {
+                'Username': usernames[i],
+                'PostCount': post_counts[i],
+                'PostTime': post_times[i],
+                'PostText': post_texts[i],
+                'QuoteText': post_quotes[i]
+            }
 
         # CLICK THROUGH NEXT PAGE
         next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..38b44c0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+scrapy
\ No newline at end of file
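As context for the "Edit `phpBB.py` and Specify" section this patch adds to the README, a minimal sketch of how the spider attributes might be filled in; the domain, start URL, and credentials below are placeholder assumptions, not values from the patch:

```python
import scrapy


class PhpbbSpider(scrapy.Spider):
    name = 'phpBB'
    # Domain only, no URLs
    allowed_domains = ['forum.example.com']  # placeholder domain
    # Placeholder board URL; point this at the forum you are permitted to crawl
    start_urls = ['https://forum.example.com/viewforum.php?f=2']
    username = 'your_username'  # only used when form_login is True
    password = 'your_password'
    # False if the board is readable without logging in, True if it is not
    form_login = True
```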
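The README's closing NOTE asks users to throttle requests via `settings.py`, but the patch does not touch that file. As a rough sketch, the built-in Scrapy settings below are one way to keep request rates polite; the specific values are assumptions, not recommendations from the patch author:

```python
# Illustrative additions to phpBB_scraper/settings.py; all values are assumptions.
DOWNLOAD_DELAY = 2.0                # pause roughly two seconds between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # avoid hitting the forum with parallel requests
AUTOTHROTTLE_ENABLED = True         # let Scrapy back off when responses slow down
AUTOTHROTTLE_START_DELAY = 2.0
AUTOTHROTTLE_MAX_DELAY = 30.0
ROBOTSTXT_OBEY = True               # respect the forum's robots.txt
```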