Here is the code. If you want the *.html and JSON data, you can scrape it 
with this. That said, I will work on converting it all to *.csv as soon as 
my R console is free. I hope this helps. Cheers. 

On Saturday, March 1, 2014 12:23:17 AM UTC-5, Fenella C wrote:

> Hello everyone, 
>
> I am wondering if any of you have the village-wise 2001 Indian census data 
> in a spreadsheet (or similar) format? I am basically looking for 
> information at the village level from the 2001 census (e.g., population of 
> the village, number of households in the village, etc.)
>
> The data is available online at the census website here 
> http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx
>  
> but it is not available in a spreadsheet. I have already tried web scraping 
> the data, but it is painfully slow, so I'm wondering if I can find it 
> elsewhere.
>
> Many thanks,
> Fenella
>

-- 
Datameet is a community of Data Science enthusiasts in India. Know more about 
us by visiting http://datameet.org
--- 
You received this message because you are subscribed to the Google Groups 
"datameet" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
For more options, visit https://groups.google.com/d/optout.
#!python2.7
"""
Download all data from the 2001 India Census: http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx.

- Saves the HTML for each village to a file in the directory html/
- Saves a dict of metadata about each village, keyed by the concatenated
  state, district, sub-district and village codes, to metadata<state code>.json
  (one file per state)

"""
import json
import os
import re
import shutil
import sys
import time
from os import path
import gzip
import codecs

import requests
from bs4 import BeautifulSoup

def write_gzipped(filename, content):
    """Write ``content`` to ``filename`` as a gzip-compressed file.

    ``content`` may be a byte string or a text (unicode) string. The file is
    opened in binary mode, so text is encoded as UTF-8 before it is written;
    byte strings are written unchanged.
    """
    if not isinstance(content, bytes):
        # Binary-mode gzip files only accept bytes; encode text explicitly
        # instead of relying on an implicit (ASCII) conversion.
        content = content.encode('utf-8')
    with gzip.open(filename, 'wb') as f:
        f.write(content)

class InvalidPostback(Exception):
    """Error describing a form POST that the server rejected.

    The instance keeps the response that came back and the form data that
    was submitted so both can be inspected or printed.
    """

    def __init__(self, response, data):
        self.data = data
        self.response = response

    def __str__(self):
        """Render the response cookies, the posted form and the page body."""
        cookies = dict(self.response.cookies)
        page = BeautifulSoup(self.response.text).prettify()
        return "Cookies: %s\nForm: %s\nText: %s" % (cookies, self.data, page)

# Two-digit state/union-territory codes mapped to display names. The codes
# are the values submitted for the state dropdown; the names are only used
# for progress output and for the saved metadata.
STATES = {
    '01': "Jammu & Kashmir",
    '02': "Himachal Pradesh",
    '03': "Punjab",
    '04': "Chandigarh",
    '05': "Uttarakhand",
    '06': "Haryana",
    '07': "NCT of Delhi",
    '08': "Rajasthan",
    '09': "Uttar Pradesh",
    '10': "Bihar",
    '11': "Sikkim",
    '12': "Arunachal Pradesh",
    '13': "Nagaland",
    '14': "Manipur",
    '15': "Mizoram",
    '16': "Tripura",
    '17': "Meghalaya",
    '18': "Assam",
    '19': "West Bengal",
    '20': "Jharkhand",
    '21': "Orissa",
    '22': "Chhattisgarh",
    '23': "Madhya Pradesh",
    '24': "Gujarat",
    '25': "Daman & Diu",
    '26': "Dadra & Nagar Haveli",
    '27': "Maharashtra",
    '28': "Andhra Pradesh",
    '29': "Karnataka",
    '30': "Goa",
    '31': "Lakshadweep",
    '32': "Kerala",
    '33': "Tamil Nadu",
    '34': "Puducherry",
    '35': "Andaman and Nicobar Islands",
}

def get_aspx_stuff(soup):
    """Read the current form state out of a parsed page.

    ``soup`` is the BeautifulSoup object for the page. The returned dict maps
    form field names to the values found in the page: the ``__VIEWSTATE`` and
    ``__EVENTVALIDATION`` inputs plus the selected option of each of the four
    dropdowns. ``__EVENTTARGET``, ``__EVENTARGUMENT`` and ``__LASTFOCUS`` are
    not read here.
    """
    def input_value(selector):
        # Value attribute of the first element matching the selector.
        return soup.select(selector)[0]['value']

    def chosen_option(selector):
        # Value of the option flagged as selected in the matching dropdown.
        dropdown = soup.select(selector)[0]
        return dropdown.find('option', selected=True)['value']

    return {
        '__VIEWSTATE': input_value("#__VIEWSTATE"),
        '__EVENTVALIDATION': input_value("#__EVENTVALIDATION"),
        'ctl00$Body_Content$drpState':
            chosen_option('#ctl00_Body_Content_drpState'),
        'ctl00$Body_Content$drpDistrict':
            chosen_option('#ctl00_Body_Content_drpDistrict'),
        'ctl00$Body_Content$drpSubDistrict':
            chosen_option('#ctl00_Body_Content_drpSubDistrict'),
        'ctl00$Body_Content$drpVillage':
            chosen_option('#ctl00_Body_Content_drpVillage'),
    }

def get_states(soup):
    """Return a dict of state option value -> option text from the page.

    Options whose value is the string 'null' are left out.
    """
    dropdown = soup.select('#ctl00_Body_Content_drpState')[0]
    return {
        option['value']: option.text
        for option in dropdown.findAll('option')
        if option['value'] != 'null'
    }

def get_districts(soup):
    """Return a dict of district option value -> option text from the page.

    Options whose value is the string 'null' are left out.
    """
    options = soup.select('#ctl00_Body_Content_drpDistrict')[0].findAll('option')
    return dict(
        (option['value'], option.text)
        for option in options
        if option['value'] != 'null'
    )

def get_sub_districts(soup):
    """Return a dict of sub-district option value -> option text from the page.

    Options whose value is the string 'null' are left out.
    """
    found = {}
    dropdown = soup.select('#ctl00_Body_Content_drpSubDistrict')[0]
    for option in dropdown.findAll('option'):
        code = option['value']
        if code == 'null':
            continue
        found[code] = option.text
    return found

def get_villages(soup):
    """Return a dict of village option value -> option text from the page.

    Options whose value is the string 'null' are left out.
    """
    dropdown = soup.select('#ctl00_Body_Content_drpVillage')[0]
    pairs = [(option['value'], option.text)
             for option in dropdown.findAll('option')]
    return {code: label for code, label in pairs if code != 'null'}

class browser(object):
    """Stateful client for the village-profile form at ``self.URL``.

    The instance keeps one ``requests`` session and the form fields
    (``self.FORM``) to send with the next POST. Each ``post_*`` method sets
    one dropdown, posts the form with ``__EVENTTARGET`` naming that dropdown,
    and refreshes ``self.FORM`` from the page that comes back. The option
    maps read from the pages are stored in ``states``, ``districts``,
    ``subdistricts`` and ``villages`` (value -> text dicts).
    """

    def __init__(self):
        """Open a session, load the form page and read the state list."""
        self.session = requests.Session()
        self.FORM = {
            '__EVENTTARGET' : '',
            '__EVENTARGUMENT' : '',
            '__LASTFOCUS' : '',
            'ctl00$Body_Content$drpState' : 'null',
            'ctl00$Body_Content$drpDistrict' : 'null',
            'ctl00$Body_Content$drpSubDistrict' : 'null',
            'ctl00$Body_Content$drpVillage' : 'null',
        }
        self.states = {}
        self.districts = {}
        self.subdistricts = {}
        # A dict like the other option maps: get_villages() returns a dict
        # and callers iterate it with .items().
        self.villages = {}
        # Last response received by post(); None until the first call.
        self.response = None
        self.URL = "http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx"
        r = self.session.post(self.URL)
        soup = BeautifulSoup(r.text)
        self.update_form(soup)
        self.states = get_states(soup)

    def post(self):
        """Post ``self.FORM`` to ``self.URL`` and return the response.

        The response is also stored in ``self.response``.

        Raises:
            InvalidPostback: if the server answers with status code 500.
        """
        r = self.session.post(self.URL, data = self.FORM)
        self.response = r
        if r.status_code == 500:
            raise InvalidPostback(r, self.FORM)
        return r

    def update_form(self, soup):
        """Update ``self.FORM`` with the form values found in the page."""
        self.FORM.update(get_aspx_stuff(soup))

    def post_state(self, state):
        """Select ``state``, store the district options, return the page text."""
        self.FORM.update({'__EVENTTARGET' : 'ctl00$Body_Content$drpState',
                          'ctl00$Body_Content$drpState': state
                      })
        r = self.post()
        soup = BeautifulSoup(r.text)
        self.districts = get_districts(soup)
        self.update_form(soup)
        return r.text

    def post_district(self, district):
        """Select ``district``, store the sub-district options, return the page text."""
        self.FORM.update({'__EVENTTARGET' : 'ctl00$Body_Content$drpDistrict',
                          'ctl00$Body_Content$drpDistrict': district})
        r = self.post()
        soup = BeautifulSoup(r.text)
        self.subdistricts = get_sub_districts(soup)
        self.update_form(soup)
        return r.text

    def post_subdistrict(self, subdistrict):
        """Select ``subdistrict``, store the village options, return the page text."""
        self.FORM.update({'__EVENTTARGET' : 'ctl00$Body_Content$drpSubDistrict',
                          'ctl00$Body_Content$drpSubDistrict': subdistrict})
        r = self.post()
        soup = BeautifulSoup(r.text)
        self.villages = get_villages(soup)
        self.update_form(soup)
        return r.text

    def post_village(self, village):
        """Select ``village``, press Submit and return the resulting page text.

        Two POSTs are made: the first selects the village in the dropdown and
        refreshes the stored form fields, the second sends the form with the
        Submit button field added and an empty ``__EVENTTARGET``.
        """
        self.FORM.update({'__EVENTTARGET' : 'ctl00$Body_Content$drpVillage',
                          'ctl00$Body_Content$drpVillage': village})
        r = self.post()
        self.update_form(BeautifulSoup(r.text))
        self.FORM.update({'__EVENTTARGET' : '',
                          'ctl00$Body_Content$btnSubmit': 'Submit'})
        try:
            r = self.post()
        finally:
            # The button field belongs to this one request only; remove it
            # even when the POST fails so later postbacks do not resend it.
            del self.FORM['ctl00$Body_Content$btnSubmit']
        return r.text

def run_state(state, state_name, dir):
    """Scrape every village page of one state and return its metadata.

    Walks the districts, sub-districts and villages offered by the form for
    ``state`` and saves each village page as ``<key>.html`` inside ``dir``,
    where ``key`` is the concatenation of the state, district, sub-district
    and village codes.

    Args:
        state: state code to submit in the state dropdown.
        state_name: display name of the state, copied into the metadata.
        dir: existing directory that receives the HTML files. (The name
            shadows the builtin but is kept so existing callers still work.)

    Returns:
        A dict mapping each key to a dict with the codes and names of the
        state, district, sub-district and village.
    """
    print("State: %s %s" % (state, state_name))
    br = browser()
    br.post_state(state)
    districts = br.districts
    metadata = {}
    for district, district_name in districts.items():
        print("District: %s %s" % (district, district_name))
        br.post_district(district)
        subdistricts = br.subdistricts
        for subdist, subdist_name in subdistricts.items():
            print("Sub District: %s %s" % (subdist, subdist_name))
            br.post_subdistrict(subdist)
            villages = br.villages
            for village, village_name in villages.items():
                r = br.post_village(village)
                key = state + district + subdist + village
                metadata[key] = {'state' : state,
                                 'state_name' : state_name,
                                 'district' : district,
                                 'district_name' : district_name,
                                 'sub_district' : subdist,
                                 'sub_district_name' : subdist_name,
                                 'village' : village,
                                 'village_name' : village_name}
                # Write into the directory the caller asked for rather than
                # a global that only exists when the file runs as a script.
                with codecs.open(path.join(dir, '%s.html' % key), 'w', encoding = "utf-8") as f:
                    f.write(r)
                # To store the page compressed instead:
                # write_gzipped(path.join(dir, '%s.html.gz' % key), r.encode('utf-8'))
    return metadata

if __name__ == "__main__":

    # Output directory for the per-village HTML pages; passed to run_state
    # below and created here if it does not exist yet.
    DATA_DIR = 'html'
    if not path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)

    # Only this subset of states is scraped in one run; use sorted(STATES)
    # instead to process every state.
    states = ['05', '06', '07', '08', '09', '10']
    for k in states:
        metadata = run_state(k, STATES[k], DATA_DIR)
        # One metadata file per state, e.g. metadata05.json.
        with codecs.open('metadata%s.json' % k, 'w', encoding = "utf-8") as f:
            json.dump(metadata, f)

Reply via email to