Re: How to crawl Youtube data recursively?

JoeJoe Sat, 04 Mar 2017 09:43:19 -0800


-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to scrapy-users+unsubscr...@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item
from scrapy.spiders import BaseSpider
#from scrapy import log
from scrapy.cmdline import execute
from scrapy.utils.markup import remove_tags
from scrapy.selector import Selector
from scrapy.http import HtmlResponse


import time

#define spider
class YoutubeSpider(scrapy.Spider):
    """Scrape metadata from a YouTube watch page and follow a related video.

    The crawl starts at the single URL in ``start_urls``. For every
    ``#page-container`` element in a response, ``parse`` yields one dict of
    video metadata; it then schedules a request for the first link found
    under ``#watch-related``, using ``parse`` as the callback again.
    """

    name = 'youtube_spider'
    start_urls = ['https://www.youtube.com/watch?v=TPYPmsnm44Q']

    custom_settings = {
        # Maximum link depth followed from the start URL.
        'DEPTH_LIMIT': 100,
    }

    # --- selectors ---------------------------------------------------------
    SET_SELECTOR = '#page-container'
    RELATED_SELECTOR = '#watch-related a ::attr(href)'

    # CSS selectors, evaluated relative to each container element.
    TITLE_SELECTOR = 'meta ::attr(content)'
    LINK_SELECTOR = 'link ::attr(href)'

    # NOTE(review): these XPaths start with ``//``, so they are evaluated
    # against the whole document rather than only the container they are
    # called on. Kept as-is to preserve what is scraped; switch to ``.//``
    # if container-relative matching is intended.
    VIEWCOUNT_SELECTOR = '//*[@id="watch7-views-info"]/div[1]/text()'
    DATEPUB_SELECTOR = '//*[@id="watch7-content"]/meta[14]/@content'
    GENRE_SELECTOR = '//*[@id="watch7-content"]/meta[15]/@content'
    NAME_SELECTOR = '//*[@id="watch7-user-header"]/div/a/text()'
    SUBS_SELECTOR = '//*[@id="watch7-subscription-container"]/span/span[1]/text()'
    LIKES_SELECTOR = '//*[@id="watch8-sentiment-actions"]/span/span[1]/button/span/text()'
    DISLIKES_SELECTOR = '//*[@id="watch8-sentiment-actions"]/span/span[3]/button/span/text()'

    def parse(self, response):
        """Yield one metadata dict per page container, then a follow-up request.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            dict: scraped fields; a field is ``None`` when its selector
                matches nothing.
            scrapy.Request: request for the first related-video link, if
                the page has one.
        """
        for youtube in response.css(self.SET_SELECTOR):
            # extract_first() returns None on no match instead of raising
            # IndexError like extract()[0]; a missing view count must not
            # abort the whole callback (and with it the rest of the crawl).
            views = youtube.xpath(self.VIEWCOUNT_SELECTOR).extract_first()
            if views is not None:
                views = views.replace("views", "")

            yield {
                'Video Title': youtube.css(self.TITLE_SELECTOR).extract_first(),
                'Video Link': youtube.css(self.LINK_SELECTOR).extract_first(),
                'Number of Views': views,
                'Date Uploaded': youtube.xpath(self.DATEPUB_SELECTOR).extract_first(),
                'Video Category': youtube.xpath(self.GENRE_SELECTOR).extract_first(),
                'User Name': youtube.xpath(self.NAME_SELECTOR).extract_first(),
                'Number of Subscribers': youtube.xpath(self.SUBS_SELECTOR).extract_first(),
                'Likes Received': youtube.xpath(self.LIKES_SELECTOR).extract_first(),
                'Dislikes Received': youtube.xpath(self.DISLIKES_SELECTOR).extract_first(),
            }

        # The related link depends only on the response, so look it up once,
        # outside the loop: the crawl then continues even when no container
        # matched, and no duplicate request is issued when several match.
        next_article = response.css(self.RELATED_SELECTOR).extract_first()
        if next_article:
            yield scrapy.Request(
                response.urljoin(next_article),
                callback=self.parse,
            )

Re: How to crawl Youtube data recursively?

Reply via email to