Mirror of https://github.com/NohamR/phpBB-forum-scraper.git (synced 2026-02-22 02:25:43 +00:00)
.gitignore
@@ -7,6 +7,7 @@
 *.xls
 *.xlsx
 *.pyc
+!requirements.txt
 
 # Exclude Directories
 __pycache__/
README.md
@@ -1,42 +1,40 @@
 # phpBB Forum Scraper
 
-Python-based scraper for phpBB forums.
-
-Code requires:
-
-1. Python scraping library, [Scrapy](http://scrapy.org/).
-
-2. Python HTML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
+Python-based web scraper for phpBB forums. Project can be used as a template for building your own
+custom Scrapy spiders or for one-off crawls on designated forums. Please keep in mind that aggressive crawls
+can contribute significant strain on web servers, so please throttle your request rates.
+
+## Requirements:
+1. Python web scraping library, [Scrapy](http://scrapy.org/).
+2. Python HTML/XML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
 
 ## Scraper Output
-Scrapes the following information from forum posts:
-
-1. Username
-
-2. User post count
-
-3. Post date & time
-
-4. Post text
-
-5. Quoted text
+The phpBB.py spider scrapes the following information from forum posts:
+1. Username
+2. User Post Count
+3. Post Date & Time
+4. Post Text
+5. Quoted Text
+
+If you need additional data scraped, you will have to create additional spiders or edit the existing spider.
 
-Edit `phpBB.py` and specify:
+## Edit `phpBB.py` and Specify:
 1. `allowed_domains`
 2. `start_urls`
 3. `username` & `password`
 4. `forum_login=False` or `forum_login=True`
 
-## Instructions:
-From within `/phpBB_scraper/`:
-
-`scrapy crawl phpBB` to launch the crawler.
-
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
+## Running the Scraper:
+```bash
+cd phpBB_scraper/
+scrapy crawl phpBB
+# scrapy crawl phpBB -o posts.csv
+```
 
 NOTE: Please adjust `settings.py` to throttle your requests.
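The closing NOTE points at Scrapy's standard throttling knobs in `settings.py`. Below is a minimal sketch of what a polite configuration might look like; the specific values, and whether these lines appear in this repository's `settings.py`, are assumptions rather than part of the diff:

```python
# Illustrative throttling settings (assumed values, not from this commit).
# These are standard Scrapy settings; tune them for the forum you crawl.
BOT_NAME = 'phpBB_scraper'

ROBOTSTXT_OBEY = True                # honour the forum's robots.txt
DOWNLOAD_DELAY = 2                   # wait ~2 seconds between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1   # never hit one forum in parallel

AUTOTHROTTLE_ENABLED = True          # back off automatically when responses slow down
AUTOTHROTTLE_START_DELAY = 2
AUTOTHROTTLE_MAX_DELAY = 30
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
```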
phpBB.py
@@ -4,26 +4,30 @@ import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
 
 class PhpbbSpider(scrapy.Spider):
 
     name = 'phpBB'
-    #Domain only, no urls
+    # Domain only, no urls
     allowed_domains = ['']
     start_urls = ['']
     username = ''
     password = ''
-    # False if you dont need to login, true if you do.
+    # False if you don't need to login, True if you do.
     form_login = False
 
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
-            formdata = {'username': self.username,
-                        'password': self.password}
-            form_request = [scrapy.FormRequest.from_response(response,
-                            formdata=formdata,
-                            callback=self.after_login,
-                            dont_click=True)]
+            formdata = {'username': self.username, 'password': self.password}
+            form_request = [
+                scrapy.FormRequest.from_response(
+                    response,
+                    formdata=formdata,
+                    callback=self.after_login,
+                    dont_click=True
+                )
+            ]
             yield form_request
             return
         else:
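The `callback=self.after_login` above refers to a method outside this hunk. A hypothetical sketch of such a callback follows, for orientation only; the error-string check and the follow-up request are assumptions, not the repository's code:

```python
# Hypothetical after_login method for PhpbbSpider; the real one may differ.
def after_login(self, response):
    # phpBB returns an error page on bad credentials; the exact text is assumed here.
    if b'Invalid username or password' in response.body:
        self.logger.error('Login failed; check username/password in phpBB.py')
        return
    # With the session cookie set, re-request the start URLs and parse posts.
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse_posts, dont_filter=True)
```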
@@ -59,7 +63,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         block_quotes = soup.find_all('blockquote')
         for i, quote in enumerate(block_quotes):
-            block_quotes[i] = '<quote-%s>='%str(i+1) + quote.get_text()
+            block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
         return ''.join(block_quotes)
 
     def clean_text(self, string):
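As a standalone illustration of the quote-tagging format produced by the reformatted line above (the sample HTML is made up):

```python
# Demo of the '<quote-N>=' tagging used in clean_quote; run outside the spider.
from bs4 import BeautifulSoup

html = '<div><blockquote>quoted post</blockquote><p>the actual reply</p></div>'
soup = BeautifulSoup(html, 'lxml')
quotes = soup.find_all('blockquote')
tagged = ['<quote-%s>=%s' % (str(i + 1), q.get_text()) for i, q in enumerate(quotes)]
print(''.join(tagged))  # -> <quote-1>=quoted post
```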
@@ -68,7 +72,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         for tag in tags:
             for i, item in enumerate(soup.find_all(tag)):
-                item.replaceWith('<reply-%s>='%str(i+1))
+                item.replaceWith('<reply-%s>=' % str(i + 1))
         return re.sub(r' +', r' ', soup.get_text())
 
     def parse_posts(self, response):
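For contrast with `clean_quote`, the `replaceWith` call here swaps each matched tag for a `<reply-N>=` placeholder before the surrounding text is extracted and excess whitespace is collapsed. A small demonstration with assumed markup:

```python
# Demo of the placeholder substitution in clean_text; not the spider itself.
import re
from bs4 import BeautifulSoup

html = '<div><blockquote>quoted post</blockquote>thanks,  that worked</div>'
soup = BeautifulSoup(html, 'lxml')
for i, item in enumerate(soup.find_all('blockquote')):
    item.replaceWith('<reply-%s>=' % str(i + 1))
print(re.sub(r' +', r' ', soup.get_text()))  # -> <reply-1>=thanks, that worked
```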
@@ -80,13 +84,15 @@ class PhpbbSpider(scrapy.Spider):
         post_quotes = [self.clean_quote(s) for s in post_texts]
         post_texts = [self.clean_text(s) for s in post_texts]
 
-        #YIELD POST DATA
+        # YIELD POST DATA
         for i in range(len(usernames)):
-            yield {'Username': usernames[i],
-                   'PostCount': post_counts[i],
-                   'PostTime': post_times[i],
-                   'PostText': post_texts[i],
-                   'QuoteText': post_quotes[i]}
+            yield {
+                'Username': usernames[i],
+                'PostCount': post_counts[i],
+                'PostTime': post_times[i],
+                'PostText': post_texts[i],
+                'QuoteText': post_quotes[i]
+            }
 
         # CLICK THROUGH NEXT PAGE
         next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
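The hunk ends at the `next_link` lookup; the request that follows it sits outside the diff. A hedged sketch of how such a link is typically followed in Scrapy, not necessarily this spider's exact code:

```python
# Hypothetical continuation after the next_link xpath above; phpBB.py may differ.
if next_link is not None:
    # response.follow resolves relative hrefs against the current page URL.
    yield response.follow(next_link, callback=self.parse_posts)
```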
requirements.txt (new file)
@@ -0,0 +1,2 @@
+bs4
+scrapy
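Both dependencies can be installed in one step with `pip install -r requirements.txt` (standard pip usage; the command itself is not part of this commit).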