Refactoring and README updates

Dascienz
2020-02-05 21:12:01 -05:00
parent 584d362856
commit e1172ff8cd
4 changed files with 48 additions and 41 deletions

.gitignore vendored

@@ -7,6 +7,7 @@
 *.xls
 *.xlsx
 *.pyc
+!requirements.txt
 
 # Exclude Directories
 __pycache__/

README.md

@@ -1,42 +1,40 @@
 # phpBB Forum Scraper
-Python-based scraper for phpBB forums.
-Code requires:
-1. Python scraping library, [Scrapy](http://scrapy.org/).
-2. Python HTML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
+Python-based web scraper for phpBB forums. The project can be used as a template for building your own
+custom Scrapy spiders or for one-off crawls of designated forums. Please keep in mind that aggressive
+crawls can put significant strain on web servers, so throttle your request rates.
+
+## Requirements:
+1. Python web scraping library, [Scrapy](http://scrapy.org/).
+2. Python HTML/XML parsing library, [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
 
 ## Scraper Output
-Scrapes the following information from forum posts:
+The phpBB.py spider scrapes the following information from forum posts:
 1. Username
-2. User Post Count
-3. Post Date & Time
-4. Post Text
-5. Quoted Text
+2. User post count
+3. Post date & time
+4. Post text
+5. Quoted text
+
+If you need additional data scraped, you will have to create additional spiders or edit the existing spider.
 
-Edit `phpBB.py` and specify:
+## Edit `phpBB.py` and Specify:
 1. `allowed_domains`
 2. `start_urls`
 3. `username` & `password`
 4. `forum_login=False` or `forum_login=True`
 
-## Instructions:
-From within `/phpBB_scraper/`:
-`scrapy crawl phpBB` to launch the crawler.
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
+## Running the Scraper:
+```bash
+cd phpBB_scraper/
+scrapy crawl phpBB
+# scrapy crawl phpBB -o posts.csv
+```
 
 NOTE: Please adjust `settings.py` to throttle your requests.
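
The NOTE above refers to Scrapy's standard rate-limiting settings. This commit does not touch `settings.py`, but as a rough illustration, a project's `settings.py` could throttle requests with options like the following (the setting names are standard Scrapy settings; the values are assumptions, not this repository's actual configuration):

```python
# Illustrative throttling options for a Scrapy settings.py file.
# The values below are assumptions, not taken from this repository.
DOWNLOAD_DELAY = 1.0                 # minimum delay (seconds) between requests
CONCURRENT_REQUESTS_PER_DOMAIN = 2   # cap parallel requests to a single forum
AUTOTHROTTLE_ENABLED = True          # adapt delays to observed server latency
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 10.0
ROBOTSTXT_OBEY = True                # respect the forum's robots.txt
```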

phpBB.py

@@ -4,6 +4,7 @@
 import scrapy
 from bs4 import BeautifulSoup
 from scrapy.http import Request
 
+
 class PhpbbSpider(scrapy.Spider):
     name = 'phpBB'
@@ -12,18 +13,21 @@ class PhpbbSpider(scrapy.Spider):
     start_urls = ['']
     username = ''
     password = ''
-    # False if you dont need to login, true if you do.
+    # False if you don't need to login, True if you do.
     form_login = False
 
     def parse(self, response):
         # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
-            formdata = {'username': self.username,
-                        'password': self.password}
-            form_request = [scrapy.FormRequest.from_response(response,
-                            formdata=formdata,
-                            callback=self.after_login,
-                            dont_click=True)]
+            formdata = {'username': self.username, 'password': self.password}
+            form_request = [
+                scrapy.FormRequest.from_response(
+                    response,
+                    formdata=formdata,
+                    callback=self.after_login,
+                    dont_click=True
+                )
+            ]
             yield form_request
             return
         else:
@@ -59,7 +63,7 @@ class PhpbbSpider(scrapy.Spider):
         soup = BeautifulSoup(string, 'lxml')
         block_quotes = soup.find_all('blockquote')
         for i, quote in enumerate(block_quotes):
-            block_quotes[i] = '<quote-%s>='%str(i+1) + quote.get_text()
+            block_quotes[i] = '<quote-%s>=%s' % (str(i + 1), quote.get_text())
         return ''.join(block_quotes)
 
     def clean_text(self, string):
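
As context for the reformatted line above, here is a minimal standalone sketch of the quote-tagging idea. The sample HTML and the `html.parser` backend are assumptions used only for illustration; the spider itself passes `'lxml'` to BeautifulSoup.

```python
from bs4 import BeautifulSoup

# Hypothetical post body, used only to show the '<quote-N>=' tagging format.
html = '<blockquote>original post</blockquote><p>reply</p><blockquote>second quote</blockquote>'

soup = BeautifulSoup(html, 'html.parser')
tagged = ['<quote-%s>=%s' % (str(i + 1), quote.get_text())
          for i, quote in enumerate(soup.find_all('blockquote'))]
print(''.join(tagged))  # -> <quote-1>=original post<quote-2>=second quote
```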
@@ -82,11 +86,13 @@ class PhpbbSpider(scrapy.Spider):
 
         # YIELD POST DATA
         for i in range(len(usernames)):
-            yield {'Username': usernames[i],
-                   'PostCount': post_counts[i],
-                   'PostTime': post_times[i],
-                   'PostText': post_texts[i],
-                   'QuoteText': post_quotes[i]}
+            yield {
+                'Username': usernames[i],
+                'PostCount': post_counts[i],
+                'PostTime': post_times[i],
+                'PostText': post_texts[i],
+                'QuoteText': post_quotes[i]
+            }
 
         # CLICK THROUGH NEXT PAGE
         next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
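
The hunk ends at the pagination lookup; the code that consumes `next_link` lies outside this diff. Below is a hedged, self-contained sketch of the usual follow-the-next-link pattern in Scrapy, mirroring the XPath from the line above. The spider name, class, and URL are hypothetical and not part of this repository.

```python
import scrapy


class PaginationSketchSpider(scrapy.Spider):
    """Hypothetical spider showing the typical follow-the-next-link pattern."""
    name = 'pagination_sketch'
    start_urls = ['https://example.com/forum/viewtopic.php?t=1']  # placeholder URL

    def parse(self, response):
        # ... scrape the current page here ...
        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
        if next_link is not None:
            # response.follow() resolves relative URLs and re-enters this callback
            yield response.follow(next_link, callback=self.parse)
```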

requirements.txt Normal file

@@ -0,0 +1,2 @@
+bs4
+scrapy
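
With the new requirements file, both dependencies can be installed in one step via `pip install -r requirements.txt`; the `lxml` parser that the spider hands to BeautifulSoup is pulled in as part of Scrapy's own dependencies.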