Python Reddit Scraper
This is a little Python script that allows you to scrape comments from a subreddit on reddit.com.
NOTE: insert the subreddit name in place of the `[INSERT FORUM HERE]` placeholder in the spider's start URL before running.
"""Scrapy spider that crawls a subreddit's thread listing and yields one
CommentItem per comment found on every thread page."""

import scrapy
# NOTE(fix): the scrapy.contrib.* paths were deprecated in Scrapy 1.0 and
# removed in later releases; these are the supported import locations.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Compose, Join, Identity

from reddit import utils
from reddit.items import CommentItem


def get_comments_count(x):
    """Pull the leading number out of a label like '123 comments'.

    Delegates to the project's regex helper; returns capture group 1.
    """
    return utils.regex_extractor(r"(.*)\s+comment.*", x, 1)


def get_upvoted(x):
    """Pull the percentage out of a score label like '(87% upvoted)'.

    Delegates to the project's regex helper; returns capture group 1.
    """
    return utils.regex_extractor(r"\((.*)\s+upvoted", x, 1)


class CommentItemLoader(ItemLoader):
    """ItemLoader for CommentItem with whitespace-stripping defaults.

    Every field is stripped on input and output; multi-node text fields
    (the self-post body and each comment body) are joined with spaces
    before stripping, and the comments/upvoted labels are reduced to
    their numeric part via the regex helpers above.
    """

    default_item_class = CommentItem
    default_input_processor = MapCompose(lambda x: x.strip())
    default_output_processor = Compose(TakeFirst(), lambda x: x.strip())
    default_selector_class = Selector

    textpost_out = Compose(Join(" "), lambda x: x.strip())
    comments_out = Compose(TakeFirst(), get_comments_count, lambda x: x.strip())
    upvoted_out = Compose(TakeFirst(), get_upvoted, lambda x: x.strip())
    comment_out = Compose(Join(" "), lambda x: x.strip())


class ThreadsSpider(CrawlSpider):
    """Crawl a subreddit: follow pagination, open each thread, scrape comments."""

    name = "threads"
    allowed_domains = ["reddit.com"]

    # Set this to the subreddit you want to scrape, e.g. "python".
    subreddit = "[INSERT FORUM HERE]"
    start_urls = (
        "https://www.reddit.com/r/%s" % subreddit,
    )

    rules = (
        # Keep following the next/prev pagination buttons on listing pages.
        Rule(
            LinkExtractor(restrict_xpaths=(".//div[@class='nav-buttons']")),
            follow=True,
        ),
        # Each thread title link on a listing page is parsed by parse_item.
        Rule(
            LinkExtractor(
                restrict_xpaths=(
                    ".//div[@class='content']//p[@class='parent']/a[@class='title']"
                )
            ),
            callback="parse_item",
        ),
    )

    def parse_item(self, response):
        """Yield a CommentItem for every comment on a single thread page.

        Thread-level fields (title, OP, date, self-text, comment count,
        score) are extracted once and repeated on each yielded item.
        """
        hxs = Selector(response)

        # Thread-level metadata, shared by every comment item on this page.
        thread = hxs.xpath(".//p[@class='title']/a/text()").extract()
        op = hxs.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract()
        thread_date = hxs.xpath(".//div[contains(@class, 'self')]//p[@class='tagline']/time/@title").extract()
        textpost = hxs.xpath(".//div[contains(@class, 'self')]//div[@class='md']//text()").extract()
        comments = hxs.xpath(".//div[contains(@class, 'self')]//a[contains(@class, 'comments')]/text()").extract()
        vote_points = hxs.xpath(".//div[@class='linkinfo']/div[@class='score']/span[@class='number']/text()").extract()
        upvoted = hxs.xpath(".//div[@class='linkinfo']/div[@class='score']/text()").extract()

        # One row per comment entry in the comment area.
        rows = hxs.xpath(".//div[@class='commentarea']//div[contains(@class, 'comment')]/div[contains(@class, 'entry')]")
        for row in rows:
            loader = CommentItemLoader(item=CommentItem(), response=response)
            loader.add_value("url", response.url)
            loader.add_value("thread", thread)
            loader.add_value("op", op)
            loader.add_value("thread_date", thread_date)
            loader.add_value("textpost", textpost)
            loader.add_value("comments", comments)
            loader.add_value("vote_points", vote_points)
            loader.add_value("upvoted", upvoted)
            # Per-comment fields come from the current row only.
            loader.add_value("comment", row.xpath(".//div[contains(@class, 'usertext-body')]//text()").extract())
            loader.add_value("user", row.xpath(".//p[@class='tagline']/a[contains(@class, 'author')]/text()").extract())
            loader.add_value("time", row.xpath(".//p[@class='tagline']/time/@title").extract())
            yield loader.load_item()