From 01fcfb586b17a7c65e48845ae6132f83d1e99078 Mon Sep 17 00:00:00 2001
From: David Ascienzo
Date: Sun, 19 Aug 2018 15:47:44 -0400
Subject: [PATCH] Upload revised phpBB forum scraping code.

---
 .gitignore                                                  |  4 +
 README.md                                                   | 43 +++++++++
 phpBB_scraper/phpBB_scraper/__init__.py                     |  0
 .../__pycache__/__init__.cpython-36.pyc                     | Bin 0 -> 167 bytes
 .../__pycache__/settings.cpython-36.pyc                     | Bin 0 -> 484 bytes
 phpBB_scraper/phpBB_scraper/items.py                        | 14 +++
 phpBB_scraper/phpBB_scraper/middlewares.py                  | 56 ++++++++++
 phpBB_scraper/phpBB_scraper/pipelines.py                    | 11 +++
 phpBB_scraper/phpBB_scraper/settings.py                     | 89 +++++++++++++++
 phpBB_scraper/phpBB_scraper/spiders/__init__.py             |  4 +
 .../spiders/__pycache__/__init__.cpython-36.pyc             | Bin 0 -> 175 bytes
 .../spiders/__pycache__/phpBB.cpython-36.pyc                | Bin 0 -> 3515 bytes
 phpBB_scraper/phpBB_scraper/spiders/phpBB.py                | 89 ++++++++++++++
 phpBB_scraper/scrapy.cfg                                    | 11 +++
 14 files changed, 321 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 phpBB_scraper/phpBB_scraper/__init__.py
 create mode 100644 phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc
 create mode 100644 phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc
 create mode 100644 phpBB_scraper/phpBB_scraper/items.py
 create mode 100644 phpBB_scraper/phpBB_scraper/middlewares.py
 create mode 100644 phpBB_scraper/phpBB_scraper/pipelines.py
 create mode 100644 phpBB_scraper/phpBB_scraper/settings.py
 create mode 100644 phpBB_scraper/phpBB_scraper/spiders/__init__.py
 create mode 100644 phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc
 create mode 100644 phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc
 create mode 100644 phpBB_scraper/phpBB_scraper/spiders/phpBB.py
 create mode 100644 phpBB_scraper/scrapy.cfg

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..712e48d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+*.DS_Store
+*.csv
+*.json
+*.txt
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0614572
--- /dev/null
+++ b/README.md
@@ -0,0 +1,43 @@
+# phpBB Forum Scraper
+
+A Python-based scraper for phpBB forums. The code requires:
+
+1. Scrapy, the Python scraping framework.
+2. BeautifulSoup, the Python HTML parsing library.
+
+## Scraper Output
+
+The spider scrapes the following information from forum posts:
+
+1. Username
+2. User post count
+3. Post date & time
+4. Post text
+5. Quoted text
+
+## Configuration
+
+Edit `phpBB.py` and specify:
+
+1. `allowed_domains`
+2. `start_urls`
+3. `username` & `password`
+4. `forum_login = False` or `forum_login = True`
+
+The attributes to fill in look like this:
+
+```python
+allowed_domains = ['']
+start_urls = ['']
+username = ''
+password = ''
+forum_login = False
+```
+
+## Instructions
+
+From within `/phpBB_scraper/`:
+
+Run `scrapy crawl phpBB` to launch the crawler.
+
+Run `scrapy crawl phpBB -o posts.csv` to launch the crawler and save the results to CSV.
\ No newline at end of file
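As a concrete illustration of the configuration step the README describes, the edited attributes in phpBB.py might look like the following. The domain, URL, and credentials are invented placeholders, not values from this project:

allowed_domains = ['forum.example.com']
start_urls = ['https://forum.example.com/viewforum.php?f=2']
username = 'my_username'  # only used when forum_login is True
password = 'my_password'  # stored in plain text, so prefer a throwaway account
forum_login = True        # set to False for forums that allow anonymous reading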
diff --git a/phpBB_scraper/phpBB_scraper/__init__.py b/phpBB_scraper/phpBB_scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22e9ac3303235dfa7d2afc94659db3c6bbda0674
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/__pycache__/__init__.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef9c43f526e9428211f4ca35c2df7d39954fb85b
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/__pycache__/settings.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/items.py b/phpBB_scraper/phpBB_scraper/items.py
new file mode 100644
index 0000000..1c50336
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class PhpbbScraperItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
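The generated items.py above is left as a stub, and the spider yields plain dicts instead. If typed items were wanted, a minimal sketch could look like this; the class and field names are chosen to mirror what the spider scrapes and are not part of this patch:

import scrapy

class PhpbbPostItem(scrapy.Item):
    # One field per value collected from each forum post.
    user = scrapy.Field()        # poster's username
    count = scrapy.Field()       # poster's total post count
    time = scrapy.Field()        # post date & time string
    post_text = scrapy.Field()   # cleaned body text
    quote_text = scrapy.Field()  # quoted text extracted from the body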
diff --git a/phpBB_scraper/phpBB_scraper/middlewares.py b/phpBB_scraper/phpBB_scraper/middlewares.py
new file mode 100644
index 0000000..da3310e
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/middlewares.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class PhpbbScraperSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn't have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/phpBB_scraper/phpBB_scraper/pipelines.py b/phpBB_scraper/phpBB_scraper/pipelines.py
new file mode 100644
index 0000000..1fe2b54
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class PhpbbScraperPipeline(object):
+    def process_item(self, item, spider):
+        return item
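The generated pipeline above passes every item through unchanged. As a sketch of where post-processing could be hooked in, the pipeline below (illustrative only, not part of this patch) drops posts whose text failed to extract and trims leftover whitespace. It assumes the dict keys the spider yields ('Post Text', 'Quote Text'):

from scrapy.exceptions import DropItem

class CleanPostPipeline(object):
    def process_item(self, item, spider):
        # Discard posts whose body text failed to extract.
        if not item.get('Post Text'):
            raise DropItem('missing post text in %r' % item)
        # Trim stray whitespace left over from the HTML cleanup.
        item['Post Text'] = item['Post Text'].strip()
        item['Quote Text'] = item['Quote Text'].strip()
        return item

It would be enabled by uncommenting ITEM_PIPELINES in settings.py and listing 'phpBB_scraper.pipelines.CleanPostPipeline': 300 there.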
diff --git a/phpBB_scraper/phpBB_scraper/settings.py b/phpBB_scraper/phpBB_scraper/settings.py
new file mode 100644
index 0000000..2f9a98b
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/settings.py
@@ -0,0 +1,89 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for phpBB_scraper project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# http://doc.scrapy.org/en/latest/topics/settings.html
+# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'phpBB_scraper'
+SPIDER_MODULES = ['phpBB_scraper.spiders']
+NEWSPIDER_MODULE = 'phpBB_scraper.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.888'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests to the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+DOWNLOAD_DELAY = 1.0
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Keep cookies enabled (the default); forum logins rely on session cookies
+COOKIES_ENABLED = True
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'phpBB_scraper.middlewares.PhpbbScraperSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'phpBB_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'phpBB_scraper.pipelines.PhpbbScraperPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
\ No newline at end of file
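settings.py above keeps AutoThrottle commented out and relies on a fixed one-second DOWNLOAD_DELAY. If a target forum proved sensitive to load, one possible adjustment is to let AutoThrottle adapt the delay instead; the values below are suggestions, not settings taken from this project:

# In settings.py: adaptive politeness instead of a fixed delay.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0         # begin near the current DOWNLOAD_DELAY
AUTOTHROTTLE_MAX_DELAY = 30.0          # back off this far if the server slows down
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # aim for one request in flight per server
CONCURRENT_REQUESTS_PER_DOMAIN = 1     # never hit the forum in parallel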
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__init__.py b/phpBB_scraper/phpBB_scraper/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35e1f9feb8bbf5152100dcb699f982e1de201412
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/__init__.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdeb4be38cc7978b98dce9855e152e05c5691a34
Binary files /dev/null and b/phpBB_scraper/phpBB_scraper/spiders/__pycache__/phpBB.cpython-36.pyc differ
diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
new file mode 100644
--- /dev/null
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -0,0 +1,89 @@
[the opening of this 89-line file, covering the imports, spider class attributes, login flow, and most of the text-cleaning helpers, is not recoverable from the corrupted patch]
+        text = re.sub(' +', ' ', soup.get_text())
+        return text
+
+    def parse_posts(self, response):
+        # Collect forum post data.
+        usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
+        postCounts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
+        postTimes = response.xpath('//p[@class="author"]/text()').extract()
+        postTexts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
+        postQuotes = [self.clean_quote(s) for s in postTexts]
+        postTexts = [self.clean_text(s) for s in postTexts]
+
+        # Yield one record per post.
+        for i in range(len(usernames)):
+            yield {'User': usernames[i], 'Count': postCounts[i],
+                   'Time': postTimes[i], 'Post Text': postTexts[i],
+                   'Quote Text': postQuotes[i]}
+
+        # Follow the link to the next page of the thread listing, if any.
+        Next = response.xpath("//li[@class='next']//a[@rel='next']/@href").extract_first()
+        if Next:
+            yield scrapy.Request(response.urljoin(Next), callback=self.parse_posts)
\ No newline at end of file
diff --git a/phpBB_scraper/scrapy.cfg b/phpBB_scraper/scrapy.cfg
new file mode 100644
index 0000000..685c4b9
--- /dev/null
+++ b/phpBB_scraper/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = phpBB_scraper.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = phpBB_scraper
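The opening of phpBB.py, including the forum_login handling the README refers to, is not recoverable from this patch. Purely as an illustration of how a phpBB login is commonly driven from a Scrapy spider, a sketch follows; the URL, form field names, and clickdata are assumptions about a generic phpBB installation, not recovered code:

import scrapy

class PhpbbLoginSketch(scrapy.Spider):
    # Hypothetical stand-in, not the spider shipped in this patch.
    name = 'phpBB_login_sketch'

    def start_requests(self):
        # phpBB installs conventionally serve the login form at ucp.php?mode=login.
        yield scrapy.Request('https://forum.example.com/ucp.php?mode=login',
                             callback=self.login)

    def login(self, response):
        # from_response carries over phpBB's hidden fields (sid, form_token,
        # creation_time) so the submission passes the forum's form checks.
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'my_username', 'password': 'my_password'},
            clickdata={'name': 'login'},  # assumed name of the submit button
            callback=self.after_login)

    def after_login(self, response):
        pass  # hand off to the real post-parsing logic once logged in

Because COOKIES_ENABLED is True in settings.py, the session cookie set by the login response is carried on every subsequent request automatically.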