mirror of
https://github.com/NohamR/phpBB-forum-scraper.git
synced 2026-02-22 02:25:43 +00:00
Fixed QuoteText tags and added .pyc files to .gitignore
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -56,7 +56,7 @@ class PhpbbSpider(scrapy.Spider):
|
|||||||
soup = BeautifulSoup(string, 'lxml')
|
soup = BeautifulSoup(string, 'lxml')
|
||||||
block_quotes = soup.find_all('blockquote')
|
block_quotes = soup.find_all('blockquote')
|
||||||
for i, quote in enumerate(block_quotes):
|
for i, quote in enumerate(block_quotes):
|
||||||
block_quotes[i] = '<quote-%s>=' + str(i) + quote.get_text()
|
block_quotes[i] = '<quote-%s>='%str(i+1) + quote.get_text()
|
||||||
return ''.join(block_quotes)
|
return ''.join(block_quotes)
|
||||||
|
|
||||||
def clean_text(self, string):
|
def clean_text(self, string):
|
||||||
@@ -65,7 +65,7 @@ class PhpbbSpider(scrapy.Spider):
|
|||||||
soup = BeautifulSoup(string, 'lxml')
|
soup = BeautifulSoup(string, 'lxml')
|
||||||
for tag in tags:
|
for tag in tags:
|
||||||
for i, item in enumerate(soup.find_all(tag)):
|
for i, item in enumerate(soup.find_all(tag)):
|
||||||
item.replaceWith('<reply-%s>=' + str(i))
|
item.replaceWith('<reply-%s>='%str(i+1))
|
||||||
return re.sub(r' +', r' ', soup.get_text())
|
return re.sub(r' +', r' ', soup.get_text())
|
||||||
|
|
||||||
def parse_posts(self, response):
|
def parse_posts(self, response):
|
||||||
|
|||||||
Reference in New Issue
Block a user