From 7b850d872d1af57a4a9bcce74f1ae353e39aaffc Mon Sep 17 00:00:00 2001 From: David Ascienzo Date: Thu, 28 May 2020 21:29:06 -0400 Subject: [PATCH 1/2] Additional support on xpaths for poor phpBB formatting --- phpBB_scraper/phpBB_scraper/spiders/phpBB.py | 42 ++++++++++---------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py index 8144ac8..0235987 100644 --- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py +++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py @@ -6,9 +6,9 @@ from scrapy.http import Request # TODO: Please provide values for the following variables # Domains only, no urls -ALLOWED_DOMAINS = [''] +ALLOWED_DOMAINS = ['filosofiki.eu'] # Starting urls -START_URLS = [''] +START_URLS = ['http://www.filosofiki.eu/viewforum.php?f=67&sid=17575a200a6f183f559b696701c4ea20'] # Is login required? True or False. FORM_LOGIN = False # Login username @@ -31,6 +31,11 @@ class PhpbbSpider(scrapy.Spider): login_url = LOGIN_URL start_urls.insert(0, login_url) + username_xpath = '//p[contains(@class, "author")]//a[contains(@class, "username")]//text()' + post_count_xpath = '//dd[@class="profile-posts" or not(@class)]//a/text()' + post_time_xpath = '//div[@class="postbody"]//time/@datetime|//div[@class="postbody"]//p[@class="author"]/text()[2]' + post_text_xpath = '//div[@class="postbody"]//div[@class="content"]' + def parse(self, response): # LOGIN TO PHPBB BOARD AND CALL AFTER_LOGIN if self.form_login: @@ -78,7 +83,7 @@ class PhpbbSpider(scrapy.Spider): block_quotes = soup.find_all('blockquote') for i, quote in enumerate(block_quotes): block_quotes[i] = '=%s' % (str(i + 1), quote.get_text()) - return ''.join(block_quotes) + return ''.join(block_quotes).strip() def clean_text(self, string): # CLEAN HTML TAGS FROM POST TEXT, MARK REPLIES TO QUOTES @@ -87,27 +92,24 @@ class PhpbbSpider(scrapy.Spider): for tag in tags: for i, item in enumerate(soup.find_all(tag)): item.replaceWith('=' % str(i + 1)) - return re.sub(r' +', r' ', soup.get_text()) + return re.sub(r' +', r' ', soup.get_text()).strip() def parse_posts(self, response): # COLLECT FORUM POST DATA - usernames = response.xpath('//p[@class="author"]//a[@class="username"]//text()').extract() - post_counts = response.xpath('//dd[@class="profile-posts"]//a/text()').extract() - post_times = response.xpath('//div[@class="postbody"]//time/@datetime').extract() - post_texts = response.xpath('//div[@class="postbody"]//div[@class="content"]').extract() - post_quotes = [self.clean_quote(s) for s in post_texts] - post_texts = [self.clean_text(s) for s in post_texts] + usernames = response.xpath(self.username_xpath).extract() + n = len(usernames) + if n > 0: + post_counts = response.xpath(self.post_count_xpath).extract() or (n * ['']) + post_times = response.xpath(self.post_time_xpath).extract() or (n * ['']) + post_texts = response.xpath(self.post_text_xpath).extract() or (n * ['']) + post_quotes = [self.clean_quote(s) for s in post_texts] + post_texts = [self.clean_text(s) for s in post_texts] + + # YIELD POST DATA + for i in range(n): + yield {'Username': str(usernames[i]).strip(), 'PostCount': str(post_counts[i]).strip(), + 'PostTime': str(post_times[i]).strip(), 'PostText': post_texts[i], 'QuoteText': post_quotes[i]} - # YIELD POST DATA - for i in range(len(usernames)): - yield { - 'Username': usernames[i], - 'PostCount': post_counts[i], - 'PostTime': post_times[i], - 'PostText': post_texts[i], - 'QuoteText': post_quotes[i] - } - # CLICK THROUGH NEXT PAGE next_link = response.xpath('//li[@class="next"]//a[@rel="next"]/@href').extract_first() if next_link: From 12ad5b5f11cf4fe942696cc7e45756b0f04bcf53 Mon Sep 17 00:00:00 2001 From: David Ascienzo Date: Thu, 28 May 2020 21:29:52 -0400 Subject: [PATCH 2/2] Additional support on xpaths for poor phpBB formatting --- phpBB_scraper/phpBB_scraper/spiders/phpBB.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py index 0235987..aece38f 100644 --- a/phpBB_scraper/phpBB_scraper/spiders/phpBB.py +++ b/phpBB_scraper/phpBB_scraper/spiders/phpBB.py @@ -6,9 +6,9 @@ from scrapy.http import Request # TODO: Please provide values for the following variables # Domains only, no urls -ALLOWED_DOMAINS = ['filosofiki.eu'] +ALLOWED_DOMAINS = [''] # Starting urls -START_URLS = ['http://www.filosofiki.eu/viewforum.php?f=67&sid=17575a200a6f183f559b696701c4ea20'] +START_URLS = [''] # Is login required? True or False. FORM_LOGIN = False # Login username