htmlparser bug? Help with my web scraper

Salient Fri, 21 Aug 2020 21:00:55 -0700

Hey guys,

I think the parseHtml() proc in the htmlparser module may have a bug in it. So 
I'm building a web crawler for scraping the Organic results in a Google Search 
Results Page and I ran into an issue sometimes when scraping. Example (for 
those of you in the United States at least), if you search for "seo consultant" 
with the code below you'll get 10 links corresponding to the 10 organic 
results. That's perfect. But if you search for "nike" you'll only get 4 out of 
the 6 (sometimes there are 7 results, it fluctuates).


I found where the issue is: when I call the parseHtml() proc on the page for 
"nike", the parsed tree only contains 4 of the 6 elements with class="g" that 
are present in the string held by the html variable.

Why is this breaking sometimes? Is this due to a bug, the way the html is, 
or... any ideas?

The code below is the entire script, I'm using Nim 1.2.2:
    
    
     nim
    
    import httpclient, nimquery, xmltree, htmlparser, strtabs, strutils, uri
    
    
    proc googleSearchQuery(query: string, results: int = 0): string =
        ## Builds a Google search URL for `query`.
        ##
        ## The query is percent-encoded with `encodeUrl` (spaces become `+`),
        ## so characters such as `&` or `#` typed by the user stay inside the
        ## `q` parameter instead of being read as URL syntax.
        ##
        ## When `results` is non-zero, `&num=<results>` is appended to the URL.
        result = "https://google.com/search?q=" &
            encodeUrl(query, usePlus = true)
        if results != 0:
            result.add("&num=" & $results)
    
    
    # Prompt on stdout, then turn the line typed on stdin into a search URL.
    stdout.writeLine "what is your search query?"
    let searchQuery = stdin.readLine().googleSearchQuery()
    
    
    proc getOrganicSerp(url: string, results: bool = true,
                        titles, snippets, rankings: bool = false) =
        ## Downloads the page at `url`, parses it, and echoes the `href` of the
        ## `a` child of every `.r` node found under `#rso .g .rc`, followed by
        ## a set of debugging counters.
        ##
        ## `results`, `titles`, `snippets` and `rankings` are accepted but are
        ## not used by the body yet.
        const userAgent =
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) " &
            "AppleWebKit/537.36 (KHTML, like Gecko) " &
            "Chrome/84.0.4147.135 Safari/537.36"

        let client = newHttpClient(userAgent = userAgent)
        # Release the connection on every exit path, including when the
        # download or the parsing below raises.
        defer: client.close()

        let html = client.getContent(url)
        # NOTE(review): this is the step reported as dropping nodes - for some
        # queries the parsed tree holds fewer `.g` elements than `html` does.
        let xml = parseHtml(html)

        let rsoNode = xml.querySelector("#rso")
        if rsoNode.isNil:
            # Without the results container there is nothing to walk.
            stderr.writeLine("no element with id \"rso\" found in the page")
            return

        var
            gCount = 0          # every `.g` node visited
            withoutRcCount = 0  # `.g` nodes that have no `.rc` descendant
            withRcCount = 0     # `.g` nodes that do have an `.rc` descendant
            linkCount = 0       # `.rc` children that yielded an href
            noLinkCount = 0     # `.rc` children that yielded no href

        for gChild in rsoNode.querySelectorAll(".g"):
            gCount += 1

            # Look the container up once instead of once per use.
            let rcContainer = gChild.querySelector(".rc")
            if rcContainer.isNil:
                # This `.g` node has no `.rc` node inside it.
                withoutRcCount += 1
                continue
            withRcCount += 1

            for rcNode in rcContainer:
                # Resolve the link step by step so that a missing `.r`, a
                # missing `a` child, or a missing `href` is counted instead
                # of crashing on a nil node.
                var href = ""
                let rNode = rcNode.querySelector(".r")
                if not rNode.isNil:
                    let anchor = rNode.child("a")
                    if not anchor.isNil:
                        href = anchor.attr("href")

                if href.len > 0:
                    echo "----------"
                    echo href
                    echo "----------"
                    linkCount += 1
                else:
                    noLinkCount += 1

        echo "gChild: ", gCount
        echo "Else's within the first if else: ", withoutRcCount
        echo "If Not Nil: ", withRcCount
        echo "A Tag: ", linkCount
        echo "No A Tag: ", noLinkCount
    
    
    # Entry point: scrape the results page for the URL built from the prompt.
    searchQuery.getOrganicSerp()
    
    
    
    Run

htmlparser bug? Help with my web scraper

Reply via email to