From 28185eb7be730f949d6ce18eea4a49a645285ed0 Mon Sep 17 00:00:00 2001
From: Dascienz
Date: Sun, 18 Aug 2019 13:49:08 -0400
Subject: [PATCH] gitignore amendments, indentation and spacing

---
 .gitignore                                   |  8 ++
 README.md                                    |  4 +-
 phpBB_scraper/phpBB_scraper/spiders/phpBB.py | 80 ++++++++++----------
 3 files changed, 52 insertions(+), 40 deletions(-)

diff --git a/.gitignore b/.gitignore
index 712e48d..6783722 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,12 @@
+# Exclude Files
 *.DS_Store
 *.csv
 *.json
 *.txt
+*.tsv
+*.xls
+*.xlsx
+
+# Exclude Directories
+__pycache__/
+.ipynb_checkpoints/
\ No newline at end of file

diff --git a/README.md b/README.md
index 94e0f87..8fe4854 100644
--- a/README.md
+++ b/README.md
@@ -37,4 +37,6 @@ From within `/phpBB_scraper/`:
 
 `scrapy crawl phpBB` to launch the crawler.
 
-`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
\ No newline at end of file
+`scrapy crawl phpBB -o posts.csv` to launch the crawler and save results to CSV.
+
+NOTE: Please adjust `settings.py` to throttle your requests.
\ No newline at end of file

diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
index 4605506..f7bebad 100644
--- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
+++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py
@@ -14,76 +14,78 @@ class PhpbbSpider(scrapy.Spider):
     form_login = False
 
     def parse(self, response):
-        #LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
+        # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN
         if self.form_login:
-            formdata = {'username':self.username,'password':self.password}
+            formdata = {'username': self.username,
+                        'password': self.password}
             form_request = [scrapy.FormRequest.from_response(response,
-                                                            formdata=formdata,
-                                                            callback=self.after_login,
-                                                            dont_click=True)]
+                                                             formdata=formdata,
+                                                             callback=self.after_login,
+                                                             dont_click=True)]
             return form_request
         else:
-            #REQUEST SUB-FORUM TITLE LINKS
+            # REQUEST SUB-FORUM TITLE LINKS
             links = response.xpath('//a[@class="forumtitle"]/@href').extract()
             for link in links:
-                yield scrapy.Request(response.urljoin(link),callback=self.parse_topics)
+                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
 
     def after_login(self, response):
-        #CHECK LOGIN SUCCESS BEFORE MAKING REQUESTS
+        # CHECK LOGIN SUCCESS BEFORE MAKING REQUESTS
         if b'authentication failed' in response.body:
             self.logger.error('Login failed.')
             return
         else:
-            #REQUEST SUB-FORUM TITLE LINKS
+            # REQUEST SUB-FORUM TITLE LINKS
             links = response.xpath('//a[@class="forumtitle"]/@href').extract()
             for link in links:
-                yield scrapy.Request(response.urljoin(link),callback=self.parse_topics)
+                yield scrapy.Request(response.urljoin(link), callback=self.parse_topics)
 
     def parse_topics(self, response):
-        #REQUEST TOPIC TITLE LINKS
+        # REQUEST TOPIC TITLE LINKS
         links = response.xpath('//a[@class="topictitle"]/@href').extract()
         for link in links:
-            yield scrapy.Request(response.urljoin(link),callback=self.parse_posts)
+            yield scrapy.Request(response.urljoin(link), callback=self.parse_posts)
 
-        #IF NEXT PAGE EXISTS, FOLLOW
-        Next = response.xpath("//li[@class='next']//a[@rel='next']/@href").extract_first()
-        if Next:
-            yield scrapy.Request(response.urljoin(Next),callback=self.parse_topics)
+        # IF NEXT PAGE EXISTS, FOLLOW
+        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
+        if next_link:
+            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_topics)
 
     def clean_quote(self, string):
-        #CLEAN HTML TAGS FROM POST TEXT, MARK QUOTES
-        soup = BeautifulSoup(string,'lxml')
-        blockQuotes = soup.find_all('blockquote')
-        for i, quote in enumerate(blockQuotes):
-            blockQuotes[i] = '=' + str(i) + quote.get_text()
-        text = ''.join(blockQuotes)
-        return text
+        # CLEAN HTML TAGS FROM POST TEXT, MARK QUOTES
+        soup = BeautifulSoup(string, 'lxml')
+        block_quotes = soup.find_all('blockquote')
+        for i, quote in enumerate(block_quotes):
+            block_quotes[i] = '=' + str(i) + quote.get_text()
+        return ''.join(block_quotes)
 
     def clean_text(self, string):
-        #CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES
+        # CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES
         tags = ['blockquote']
-        soup = BeautifulSoup(string,'lxml')
+        soup = BeautifulSoup(string, 'lxml')
         for tag in tags:
             for i, item in enumerate(soup.find_all(tag)):
                 item.replaceWith('=' + str(i))
-        text = re.sub(' +',' ',soup.get_text())
-        return text
+        return re.sub(r' +', r' ', soup.get_text())
 
     def parse_posts(self, response):
-        #COLLECT FORUM POST DATA
+        # COLLECT FORUM POST DATA
         usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract()
-        postCounts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
-        postTimes = response.xpath('//p[@class="author"]/text()').extract()
-        postTexts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
-        postQuotes = [self.clean_quote(s) for s in postTexts]
-        postTexts = [self.clean_text(s) for s in postTexts]
+        post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract()
+        post_times = response.xpath('//p[@class="author"]/text()').extract()
+        post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract()
+        post_quotes = [self.clean_quote(s) for s in post_texts]
+        post_texts = [self.clean_text(s) for s in post_texts]
 
         #YIELD POST DATA
         for i in range(len(usernames)):
-            yield {'User':usernames[i],'Count':postCounts[i],
-                   'Time':postTimes[i],'Post Text':postTexts[i],'Quote Text':postQuotes[i]}
+            yield {'Username': usernames[i],
+                   'PostCount': post_counts[i],
+                   'PostTime': post_times[i],
+                   'PostText': post_texts[i],
+                   'QuoteText': post_quotes[i]}
 
-        #CLICK THROUGH NEXT PAGE
-        Next = response.xpath("//li[@class='next']//a[@rel='next']/@href").extract_first()
-        if Next:
-            yield scrapy.Request(response.urljoin(Next),callback=self.parse_posts)
\ No newline at end of file
+        # CLICK THROUGH NEXT PAGE
+        next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first()
+        if next_link:
+            yield scrapy.Request(response.urljoin(next_link), callback=self.parse_posts)
\ No newline at end of file
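
The README note added above recommends throttling requests via `settings.py`, but that file is not touched by this patch. A minimal sketch of what such a throttling section could look like, using Scrapy's standard delay and AutoThrottle settings (the specific values below are illustrative assumptions, not taken from this repository):

    # settings.py (sketch) -- throttle settings; values are assumptions, not part of this patch
    ROBOTSTXT_OBEY = True               # respect the forum's robots.txt
    DOWNLOAD_DELAY = 1.0                # minimum delay between requests (assumed value)
    CONCURRENT_REQUESTS_PER_DOMAIN = 2  # keep concurrency low against a single board (assumed value)
    AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to observed server latency
    AUTOTHROTTLE_START_DELAY = 1.0      # initial adaptive delay (assumed value)
    AUTOTHROTTLE_MAX_DELAY = 10.0       # upper bound on the adaptive delay (assumed value)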