Refactoring and README updates

2026-07-12 07:00:10 +00:00 · 2020-02-05 21:12:01 -05:00
parent 584d362856
commit e1172ff8cd
4 changed files with 48 additions and 41 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.xls
 *.xlsx
 *.pyc
+!requirements.txt

 # Exclude Directories
 __pycache__/
--- a/README.md
+++ b/README.md
@@ -1,42 +1,40 @@
 # phpBB Forum Scraper
-Python-based scraper for phpBB forums.

-Code requires: 
+Python-based web scraper for phpBB forums. This project can be used as a template for building your own
+custom Scrapy spiders or for one-off crawls of designated forums. Please keep in mind that aggressive crawls
+can put significant strain on web servers, so please throttle your request rates.

-1. Python scraping library, [Scrapy](http://scrapy.org/).

-2. Python HTML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
+## Requirements: 
+
+1. Python web scraping library, [Scrapy](http://scrapy.org/).   
+2. Python HTML/XML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).


 ## Scraper Output
-Scrapes the following information from forum posts: 

-	1. Username
+The phpBB.py spider scrapes the following information from forum posts:
+1. Username
+2. User Post Count
+3. Post Date & Time
+4. Post Text
+5. Quoted Text

-	2. User post count
-
-	3. Post date & time
-
-	4. Post text
-    
-    5. Quoted text
+If you need additional data scraped, you will have to create additional spiders or edit the existing spider.


-Edit `phpBB.py` and specify:
+## Edit `phpBB.py` and Specify:

 1. `allowed_domains`
-    
 2. `start_urls` 
-    
 3. `username` & `password`
-    
 4. `form_login=False` or `form_login=True`

-## Instructions:
-From within `/phpBB_scraper/`:
-
-`scrapy crawl phpBB` to launch the crawler.
-
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.

+## Running the Scraper:
+```bash
+cd phpBB_scraper/
+scrapy crawl phpBB
+# scrapy crawl phpBB -o posts.csv
+```
 NOTE: Please adjust `settings.py` to throttle your requests.
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -4,26 +4,30 @@ import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request

+
 class PhpbbSpider(scrapy.Spider):
    
    name = 'phpBB'
-    #Domain only, no urls
+    # Domain only, no urls
    allowed_domains = ['']
    start_urls = ['']
    username = ''
    password = ''
-    # False if you dont need to login, true if you do.
+    # False if you don't need to log in, True if you do.
    form_login = False
    
    def parse(self, response):
        # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
        if self.form_login:
-            formdata = {'username': self.username, 
-                        'password': self.password}
-            form_request = [scrapy.FormRequest.from_response(response,
+            formdata = {'username': self.username, 'password': self.password}
+            form_request = [
+                scrapy.FormRequest.from_response(
+                    response,
                    formdata=formdata,
                    callback=self.after_login,
-                                                             dont_click=True)]
+                    dont_click=True
+                )
+            ]
            yield form_request
            return
        else:
@@ -59,7 +63,7 @@ class PhpbbSpider(scrapy.Spider):
        soup = BeautifulSoup(string, 'lxml')
        block_quotes = soup.find_all('blockquote')
        for i, quote in enumerate(block_quotes):
-            block_quotes[i] = '<quote-%s>='%str(i+1) + quote.get_text()
+            block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
        return ''.join(block_quotes)
    
    def clean_text(self, string):
@@ -68,7 +72,7 @@ class PhpbbSpider(scrapy.Spider):
        soup = BeautifulSoup(string, 'lxml')
        for tag in tags:
            for i, item in enumerate(soup.find_all(tag)):
-                item.replaceWith('<reply-%s>='%str(i+1))
+                item.replaceWith('<reply-%s>=' % str(i + 1))
        return re.sub(r' +', r' ', soup.get_text())
      
    def parse_posts(self, response):
@@ -80,13 +84,15 @@ class PhpbbSpider(scrapy.Spider):
        post_quotes = [self.clean_quote(s) for s in post_texts]
        post_texts = [self.clean_text(s) for s in post_texts]

-        #YIELD POST DATA
+        # YIELD POST DATA
        for i in range(len(usernames)):
-            yield {'Username': usernames[i],
+            yield {
+                'Username': usernames[i],
                'PostCount': post_counts[i],
                'PostTime': post_times[i],
                'PostText': post_texts[i],
-                   'QuoteText': post_quotes[i]}
+                'QuoteText': post_quotes[i]
+            }
        
        # CLICK THROUGH NEXT PAGE
        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+scrapy