commit 01fcfb586b17a7c65e48845ae6132f83d1e99078
Author: David Ascienzo
Date:   Sun Aug 19 15:47:44 2018 -0400

    Uploading revised phpBB forum scraping code.

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..712e48d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.DS_Store
+*.csv
+*.json
+*.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0614572
--- /dev/null
+++ b/README.md
@@ -0,0 +1,49 @@
+# phpBB Forum Scraper
+Python-based scraper for phpBB forums.
+
+Code requires:
+
+ 1. Scrapy, the Python scraping framework.
+
+ 2. BeautifulSoup, the Python HTML-parsing library (the spider uses its `lxml` parser, which must also be installed).
+
+
+## Scraper Output
+Scrapes the following information from forum posts:
+
+ 1. Username
+
+ 2. User post count
+
+ 3. Post date & time
+
+ 4. Post text
+
+ 5. Quoted text
+
+
+## Configuration
+The spider's default settings in `phpBB.py`:
+
+    allowed_domains = ['']
+    start_urls = ['']
+    username = ''
+    password = ''
+    form_login = False
+
+Edit `phpBB.py` and specify:
+
+ 1. `allowed_domains`
+
+ 2. `start_urls`
+
+ 3. `username` & `password`
+
+ 4. `form_login=False` or `form_login=True`
+
+## Instructions
+From within `/phpBB_scraper/`:
+
+`scrapy crawl phpBB` to launch the crawler.
+
+`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
\ No newline at end of file
diff --git a/phpBB_scraper/phpBB_scraper/__init__.py b/phpBB_scraper/phpBB_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..22e9ac3
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc
new file mode 100644
index 0000000..ef9c43f
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/items.py b/phpBB_scraper/phpBB_scraper/items.py
new file mode 100644
index 0000000..1c50336
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class PhpbbScraperItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/phpBB_scraper/phpBB_scraper/middlewares.py b/phpBB_scraper/phpBB_scraper/middlewares.py
new file mode 100644
index 0000000..da3310e
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/middlewares.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class PhpbbScraperSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/phpBB_scraper/phpBB_scraper/pipelines.py b/phpBB_scraper/phpBB_scraper/pipelines.py
new file mode 100644
index 0000000..1fe2b54
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class PhpbbScraperPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/phpBB_scraper/phpBB_scraper/settings.py b/phpBB_scraper/phpBB_scraper/settings.py
new file mode 100644
index 0000000..2f9a98b
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/settings.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for phpBB_scraper project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'phpBB_scraper'
+SPIDER_MODULES = ['phpBB_scraper.spiders']
+NEWSPIDER_MODULE = 'phpBB_scraper.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 1.0
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Keep cookies enabled so the phpBB login session persists
+COOKIES_ENABLED = True
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'phpBB_scraper.middlewares.PhpbbScraperSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'phpBB_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'phpBB_scraper.pipelines.PhpbbScraperPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
\ No newline at end of file
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__init__.py b/phpBB_scraper/phpBB_scraper/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000..35e1f9f
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc
new file mode 100644
index 0000000..cdeb4be
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
new file mode 100644
index 0000000..4605506
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+import re
+
+import scrapy
+from bs4 import BeautifulSoup
+
+
+class PhpbbSpider(scrapy.Spider):
+
+    name = 'phpBB'
+    allowed_domains = ['']
+    start_urls = ['']
+    username = ''
+    password = ''
+    form_login = False
+
+    def parse(self, response):
+        # Log in to the phpBB board, then continue in after_login().
+        # Note: this method is a generator, so the login request must be
+        # yielded; returning it would silently drop the request.
+        if self.form_login:
+            formdata = {'username': self.username, 'password': self.password}
+            yield scrapy.FormRequest.from_response(response,
+                                                   formdata=formdata,
+                                                   callback=self.after_login,
+                                                   dont_click=True)
+        else:
+            # Request sub-forum title links
+            links = response.xpath('//a[@class="forumtitle"]/@href').extract()
+            for link in links:
+                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
+
+    def after_login(self, response):
+        # Check login success before making further requests
+        if b'authentication failed' in response.body:
+            self.logger.error('Login failed.')
+            return
+        # Request sub-forum title links
+        links = response.xpath('//a[@class="forumtitle"]/@href').extract()
+        for link in links:
+            yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
+
+    def parse_topics(self, response):
+        # Request topic title links
+        links = response.xpath('//a[@class="topictitle"]/@href').extract()
+        for link in links:
+            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
+
+        # If a next page of topics exists, follow it
+        next_page = response.xpath("//li[@class='next']//a[@rel='next']/@href").extract_first()
+        if next_page:
+            yield scrapy.Request(response.urljoin(next_page), callback=self.parse_topics)
+
+    def clean_quote(self, string):
+        # Strip HTML tags from a post, keeping only quoted text; each quote
+        # is prefixed with an '=<index>' marker
+        soup = BeautifulSoup(string, 'lxml')
+        block_quotes = soup.find_all('blockquote')
+        for i, quote in enumerate(block_quotes):
+            block_quotes[i] = '=' + str(i) + quote.get_text()
+        return ''.join(block_quotes)
+
+    def clean_text(self, string):
+        # Strip HTML tags from a post, replacing each quote with its
+        # '=<index>' marker so replies can be matched to quotes
+        soup = BeautifulSoup(string, 'lxml')
+        for i, quote in enumerate(soup.find_all('blockquote')):
+            quote.replace_with('=' + str(i))
+        return re.sub(' +', ' ', soup.get_text())
+
+    def parse_posts(self, response):
+        # Collect forum post data
+        usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
+        post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
+        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
+        post_quotes = [self.clean_quote(s) for s in post_texts]
+        post_texts = [self.clean_text(s) for s in post_texts]
+
+        # Yield one item per post; zip() stops at the shortest list, so a
+        # post with a missing field cannot raise an IndexError
+        for user, count, time, text, quote in zip(usernames, post_counts,
+                                                  post_times, post_texts, post_quotes):
+            yield {'User': user, 'Count': count, 'Time': time,
+                   'Post Text': text, 'Quote Text': quote}
+
+        # If a next page of posts exists, follow it
+        next_page = response.xpath("//li[@class='next']//a[@rel='next']/@href").extract_first()
+        if next_page:
+            yield scrapy.Request(response.urljoin(next_page), callback=self.parse_posts)
\ No newline at end of file
diff --git a/phpBB_scraper/scrapy.cfg b/phpBB_scraper/scrapy.cfg
new file mode 100644
index 0000000..685c4b9
--- /dev/null
+++ b/phpBB_scraper/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = phpBB_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = phpBB_scraper
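For reference, the crawler can also be launched from a plain Python script rather than the `scrapy crawl` CLI. Below is a minimal sketch using Scrapy's `CrawlerProcess`; it assumes it is run from the project root (next to `scrapy.cfg`), and the `run_spider.py` name and `posts.csv` output path are illustrative. `FEED_URI`/`FEED_FORMAT` are the feed-export settings of Scrapy releases from this era; newer versions use the `FEEDS` dict instead.

```python
# run_spider.py -- illustrative name; place next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from phpBB_scraper.spiders.phpBB import PhpbbSpider

# Load the project's settings.py, then add a CSV feed export
# (equivalent to `scrapy crawl phpBB -o posts.csv`)
settings = get_project_settings()
settings.set('FEED_FORMAT', 'csv')   # newer Scrapy: use the FEEDS dict
settings.set('FEED_URI', 'posts.csv')

process = CrawlerProcess(settings)
# Keyword arguments override the spider's class attributes, mirroring
# `scrapy crawl phpBB -a username=...` on the command line
process.crawl(PhpbbSpider, form_login=False)
process.start()  # blocks until the crawl finishes
```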