Python - Scraping myrecipes.com using Scrapy

hby01 Fri, 22 Jan 2016 07:47:30 -0800

I am trying to scrape recipe website: myrecipes.com to extract recipe 
details to be stored in an Sqlite db in my recipe android application. So 
far, I was able to obtain data regarding recipe ingredients, instructions, 
servings, nutrients etc... I'm trying to obtain data relating to the time 
it takes a recipe to be made. The issue is the not all recipes have the 
same time info., some might include: Total time or prep time or cook time 
but not always.


To deal with this I use the following code on the html snippet below:

 # Recipe time
        duration_nodes = sel.xpath('//div[@class = "panel-pane 
pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = 
"pane-content"]/div[@class = "field-collection-container clearfix"]')

        for duration_node in duration_nodes:
            try:
                path = duration_node.xpath('//div[@class = "panel-pane 
pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = 
"pane-content"]/div/div[@class = 
"field-recipe-time"]/div/div/span[1]/text()').extract()
                if path == 'Prep: ':
                    recipe['prep_time'] = duration_node.xpath('//div[@class 
= "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Cook: ':
                    recipe['cook_time'] = duration_node.xpath('//div[@class 
= "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Total: ':
                    recipe['total_time'] = 
duration_node.xpath('//div[@class = 
"field-recipe-time"]/div/div/span[2]/text()').extract()
            except:
                continue
     
        


<div class="panel-pane pane-entity-field pane-node-field-recipe-time 
recipe-time">
  
        <h2 class="pane-title">Recipe Time</h2>
    
  
  <div class="pane-content">
    <div class="field-collection-container clearfix">
  <div class="field-recipe-time">
    <div class="field-collection-view clearfix view-mode-recipe-time">
<div class="recipe-time-info">
  <span class="recipe-time-text">Prep: </span>
  <span class="recipe-time-duration">25 Minutes</span>
</div>
</div>  </div>
  <div class="field-recipe-time">
    <div class="field-collection-view clearfix view-mode-recipe-time 
field-collection-view-final">
<div class="recipe-time-info">
  <span class="recipe-time-text">Cook: </span>
  <span class="recipe-time-duration">45 Minutes</span>
</div>
</div>  </div>
</div>  </div>

  
  </div>

The full code is:
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and 
manage
# your spiders.

from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
#from scrapy.

from myrecipes.items import MyrecipesRecipe, Ingredient, Nutrients

class MyrecipesSpider(CrawlSpider):
    name = "myrecipes" # name of the spider to be used when crawling
    allowed_domains = ["myrecipes.com"] # where the spider is allowed to go
    start_urls = 
["http://www.myrecipes.com/recipe/indian-chickpea-vegetable-stew";]

    def parse(self, response):
        sel = Selector(response) # the selector
        recipe = MyrecipesRecipe()

        # Name
        recipe['name'] = sel.xpath("substring-before(//title/text(),' 
Recipe')").extract()

        # Cuisine
        recipe['cuisine'] = "Indian"
        
        # Ingredients
        ingredients = []
        ingredient_nodes = sel.xpath('//*[@class = "panel-pane 
pane-entity-field pane-node-field-ingredients"]/div/div')

        for ingredient_node in ingredient_nodes:
            try:
                name = ingredient_node.xpath('//div[@class = 
"field-ingredients"]/div/div/span[@itemprop = "name"]/text()').extract()
                quantity = ingredient_node.xpath('//div[@class = 
"field-ingredients"]/div/div/span[@itemprop = "amount"]/text()').extract()
            except:
                continue

            ingredient = Ingredient()
            ingredient['name'] = name
            ingredient['quantity'] = quantity
            ingredients.append(ingredient)

        recipe['ingredients'] = ingredients
        
        # Directions
        instructions = []
        instruction_nodes = sel.xpath('//div[@itemprop = 
"instructions"]/div[@class = "field-instructions"]/div/div[@class = 
"field-item even"]')

        for instruction_node in instruction_nodes:
            try:
                instruction_step = instruction_node.xpath('//div[@itemprop 
= "instructions"]/div[@class = "field-instructions"]/div/div[@class = 
"field-item even"]/*/text()').extract()
            except:
                continue
            instructions.append(instruction_step)
        
        recipe['instructions'] = instructions
        
        # Nutritional Info
        nutrients = []
        nutrient_nodes = sel.xpath('//div[@class = "panel-pane 
pane-entity-field pane-node-field-nutrition-data"]/div/div[@itemprop = 
"nutrition"]')

        for nutrient_node in nutrient_nodes:
            try:
                name = nutrient_node.xpath('//div[@class = 
"field-nutrition-data"]/div[contains (@class, "field-collection-view 
clearfix view-mode-recipe-nutrition")]/div/text()').extract()
                quantity = nutrient_node.xpath('//div[@class = 
"field-nutrition-data"]/div[contains(@class, "field-collection-view 
clearfix view-mode-recipe-nutrition")]/div/span/text()').extract()
            except:
                continue
            
            nutrient = Nutrients()
            nutrient['name'] =  name
            nutrient['quantity'] = quantity
            nutrients.append(nutrient)
        nutrient_name = []
        x = nutrients[0].get('name')
        for i in x:
            if i != "\n":
                nutrient_name.append(i)
        nutrients[0]['name'] = nutrient_name
        
        recipe['nutrients'] = nutrients
        
        # Recipe time
        duration_nodes = sel.xpath('//div[@class = "panel-pane 
pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = 
"pane-content"]/div[@class = "field-collection-container clearfix"]')

        for duration_node in duration_nodes:
            try:
                path = duration_node.xpath('//div[@class = "panel-pane 
pane-entity-field pane-node-field-recipe-time recipe-time"]/div[@class = 
"pane-content"]/div/div[@class = 
"field-recipe-time"]/div/div/span[1]/text()').extract()
                if path == 'Prep: ':
                    recipe['prep_time'] = duration_node.xpath('//div[@class 
= "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Cook: ':
                    recipe['cook_time'] = duration_node.xpath('//div[@class 
= "field-recipe-time"]/div/div/span[2]/text()').extract()
                elif path == 'Total: ':
                    recipe['total_time'] = 
duration_node.xpath('//div[@class = 
"field-recipe-time"]/div/div/span[2]/text()').extract()
            except:
                continue
     
        
        # Number of Servings
        recipe['servings'] = sel.xpath("substring-after(//div[@class = 
'panel-pane pane-entity-field pane-node-field-yield']/div[@class = 
'pane-content']/div[@itemprop = 'yield']/div[@class = 
'field-yield']/text(), ': ')").extract()

        return recipe



When I run the code, the output does not register and I can't figure why. I 
suspect the issue is with the if-else statements after testing. Any help 
would be appreciated.



-- 
You received this message because you are subscribed to the Google Groups 
"scrapy-users" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to scrapy-users+unsubscr...@googlegroups.com.
To post to this group, send email to scrapy-users@googlegroups.com.
Visit this group at https://groups.google.com/group/scrapy-users.
For more options, visit https://groups.google.com/d/optout.

Python - Scraping myrecipes.com using Scrapy

Reply via email to