From e1172ff8cdfec6207611e4d5ec9e2e8b98cd17fc Mon Sep 17 00:00:00 2001
From: Dascienz
Date: Wed, 5 Feb 2020 21:12:01 -0500
Subject: [PATCH] Refactoring and README updates

---
 .gitignore                                   |  1 +
 README.md                                    | 48 ++++++++++----------
 phpBB_scraper/phpBB_scraper/spiders/phpBB.py | 38 +++++++++-------
 requirements.txt                             |  2 +
 4 files changed, 48 insertions(+), 41 deletions(-)
 create mode 100644 requirements.txt

diff --git a/.gitignore b/.gitignore
index 2970cae..5033f1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.xls
 *.xlsx
 *.pyc
+!requirements.txt
 
 # Exclude Directories
 __pycache__/
diff --git a/README.md b/README.md
index 8fe4854..0b173e9 100644
--- a/README.md
+++ b/README.md
@@ -1,42 +1,40 @@
 # phpBB Forum Scraper
-Python-based scraper for phpBB forums.
-Code requires:
+Python-based web scraper for phpBB forums. The project can be used as a template for building your own
+custom Scrapy spiders or for one-off crawls of designated forums. Keep in mind that aggressive crawls
+can put significant strain on web servers, so please throttle your request rates.
-1. Python scraping library, [Scrapy](http://scrapy.org/).
-
-2. Python HTML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
+
+## Requirements:
+
+1. Python web scraping library, [Scrapy](http://scrapy.org/).
+2. Python HTML/XML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
 
 ## Scraper Output
-Scrapes the following information from forum posts:
- 1. Username
+The `phpBB.py` spider scrapes the following information from forum posts:
+1. Username
+2. User Post Count
+3. Post Date & Time
+4. Post Text
+5. Quoted Text
- 2. User post count
- 
- 3. Post date & time
- 
- 4. Post text
- 
- 5. Quoted text
+If you need additional data scraped, you will have to edit the existing spider or create additional ones.
 
-Edit `phpBB.py` and specify:
+## Edit `phpBB.py` and Specify:
 1. `allowed_domains`
-
-2. `start_urls`
-
+2. `start_urls`
 3. `username` & `password`
-
 4. `form_login=False` or `form_login=True`
 
-## Instructions:
-From within `/phpBB_scraper/`:
-
-`scrapy crawl phpBB` to launch the crawler.
-
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
+## Running the Scraper:
+```bash
+cd phpBB_scraper/
+scrapy crawl phpBB
+# scrapy crawl phpBB -o posts.csv
+```
 
 NOTE: Please adjust `settings.py` to throttle your requests.
\ No newline at end of file
diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
index cdb713e..d398af5 100644
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -4,26 +4,30 @@
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
+
 class PhpbbSpider(scrapy.Spider):
     name = 'phpBB'
-    #Domain only, no urls
+    # Domain only, no URLs
     allowed_domains = ['']
     start_urls = ['']
     username = ''
     password = ''
-    # False if you dont need to login, true if you do.
+    # False if you don't need to log in, True if you do.
     form_login = False
 
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
-            formdata = {'username': self.username,
-                        'password': self.password}
-            form_request = [scrapy.FormRequest.from_response(response,
-                                                             formdata=formdata,
-                                                             callback=self.after_login,
-                                                             dont_click=True)]
+            formdata = {'username': self.username, 'password': self.password}
+            form_request = [
+                scrapy.FormRequest.from_response(
+                    response,
+                    formdata=formdata,
+                    callback=self.after_login,
+                    dont_click=True
+                )
+            ]
             yield form_request
             return
         else:
@@ -59,7 +63,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         block_quotes = soup.find_all('blockquote')
         for i, quote in enumerate(block_quotes):
-            block_quotes[i] = '%s='%str(i+1) + quote.get_text()
+            block_quotes[i] = '%s=%s' % (str(i + 1), quote.get_text())
         return ''.join(block_quotes)
 
     def clean_text(self, string):
@@ -68,7 +72,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         for tag in tags:
             for i, item in enumerate(soup.find_all(tag)):
-                item.replaceWith('%s='%str(i+1))
+                item.replaceWith('%s=' % str(i + 1))
         return re.sub(r' +', r' ', soup.get_text())
 
     def parse_posts(self, response):
@@ -80,13 +84,15 @@
         post_quotes = [self.clean_quote(s) for s in post_texts]
         post_texts = [self.clean_text(s) for s in post_texts]
 
-        #YIELD POST DATA
+        # YIELD POST DATA
         for i in range(len(usernames)):
-            yield {'Username': usernames[i],
-                   'PostCount': post_counts[i],
-                   'PostTime': post_times[i],
-                   'PostText': post_texts[i],
-                   'QuoteText': post_quotes[i]}
+            yield {
+                'Username': usernames[i],
+                'PostCount': post_counts[i],
+                'PostTime': post_times[i],
+                'PostText': post_texts[i],
+                'QuoteText': post_quotes[i]
+            }
 
         # CLICK THROUGH NEXT PAGE
         next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..38b44c0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+scrapy
\ No newline at end of file
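As context for the "Edit `phpBB.py` and Specify" section this patch adds to the README, a minimal sketch of how the spider attributes might be filled in; the domain, start URL, and credentials below are placeholder assumptions, not values from the patch:

```python
import scrapy


class PhpbbSpider(scrapy.Spider):
    name = 'phpBB'
    # Domain only, no URLs
    allowed_domains = ['forum.example.com']  # placeholder domain
    # Placeholder board URL; point this at the forum you are permitted to crawl
    start_urls = ['https://forum.example.com/viewforum.php?f=2']
    username = 'your_username'  # only used when form_login is True
    password = 'your_password'
    # False if the board is readable without logging in, True if it is not
    form_login = True
```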
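The README's closing NOTE asks users to throttle requests via `settings.py`, but the patch does not touch that file. As a rough sketch, the built-in Scrapy settings below are one way to keep request rates polite; the specific values are assumptions, not recommendations from the patch author:

```python
# Illustrative additions to phpBB_scraper/settings.py; all values are assumptions.
DOWNLOAD_DELAY = 2.0                # pause roughly two seconds between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 1  # avoid hitting the forum with parallel requests
AUTOTHROTTLE_ENABLED = True         # let Scrapy back off when responses slow down
AUTOTHROTTLE_START_DELAY = 2.0
AUTOTHROTTLE_MAX_DELAY = 30.0
ROBOTSTXT_OBEY = True               # respect the forum's robots.txt
```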