Here is the code. If you want the *.html and JSON data you can scrape it with this. That said, I will work on converting it all to *.csv as soon as my R console is free. I hope this helps. Cheers.
On Saturday, March 1, 2014 12:23:17 AM UTC-5, Fenella C wrote: > Hello everyone, > > I am wondering if any of you have the village-wise 2001 Indian census data > in a spreadsheet (or similar) format? I am basically looking for > information at the village level from the 2001 census (e.g., population of > the village, number of households in the village, etc.) > > The data is available online at the census website here > http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx > > but it is not available in a spreadsheet. I have already tried web scraping > the data, but it is painfully slow, so I'm wondering if I can find it > elsewhere. > > Many thanks, > Fenella > -- Datameet is a community of Data Science enthusiasts in India. Know more about us by visiting http://datameet.org --- You received this message because you are subscribed to the Google Groups "datameet" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. For more options, visit https://groups.google.com/d/optout.
#!python2.7
"""
Download all data from the 2001 India Census:
http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx.

- saves the html for each village to a file in directory html/
- Saves a list of dict objects with metadata about each village to metadata.json
"""
import json
import os
import re
import shutil
import sys
import time
from os import path
import gzip
import codecs

import requests
from bs4 import BeautifulSoup


def write_gzipped(filename, content):
    """Write `content` to `filename`, gzip-compressed."""
    with gzip.open(filename, 'wb') as f:
        f.write(content)


class InvalidPostback(Exception):
    """Raised when an ASP.NET postback fails (the server answered HTTP 500).

    Carries the failed response and the form data that was posted so the
    failure can be inspected and reproduced.
    """

    def __init__(self, response, data):
        self.response = response
        self.data = data

    def __str__(self):
        return "Cookies: %s\nForm: %s\nText: %s" % (
            dict(self.response.cookies),
            self.data,
            BeautifulSoup(self.response.text, "html.parser").prettify(),
        )


# 2001 census state codes -> state names (spellings as used by the site).
STATES = {
    '01': "Jammu & Kashmir",
    '02': "Himachal Pradesh",
    '03': "Punjab",
    '04': "Chandigarh",
    '05': "Uttarakhand",
    '06': "Haryana",
    '07': "NCT of Delhi",
    '08': "Rajasthan",
    '09': "Uttar Pradesh",
    '10': "Bihar",
    '11': "Sikkim",
    '12': "Arunachal Pradesh",
    '13': "Nagaland",
    '14': "Manipur",
    '15': "Mizoram",
    '16': "Tripura",
    '17': "Meghalaya",
    '18': "Assam",
    '19': "West Bengal",
    '20': "Jharkhand",
    '21': "Orissa",
    '22': "Chhattisgarh",
    '23': "Madhya Pradesh",
    '24': "Gujarat",
    '25': "Daman & Diu",
    '26': "Dadra & Nagar Haveli",
    '27': "Maharastra",
    '28': "Andhra Pradesh",
    '29': "Karnataka",
    '30': "Goa",
    '31': "Lakshadweep",
    '32': "Kerala",
    '33': "Tamil Nadu",
    '34': "Puducherry",
    '35': "Andaman and Nicobar Islands",
}

# Form field name -> CSS selector for the four cascading dropdowns.
_DROPDOWNS = [
    ('ctl00$Body_Content$drpState', '#ctl00_Body_Content_drpState'),
    ('ctl00$Body_Content$drpDistrict', '#ctl00_Body_Content_drpDistrict'),
    ('ctl00$Body_Content$drpSubDistrict', '#ctl00_Body_Content_drpSubDistrict'),
    ('ctl00$Body_Content$drpVillage', '#ctl00_Body_Content_drpVillage'),
]


def _selected_value(soup, selector):
    """Return the value of the currently-selected <option> of a dropdown."""
    return soup.select(selector)[0].find('option', selected=True)['value']


def get_aspx_stuff(soup):
    """
    Pull out the current values of the form from a BeautifulSoup object of
    the webpage: the hidden ASP.NET state fields plus the four dropdowns.
    """
    ret = {
        '__VIEWSTATE': soup.select("#__VIEWSTATE")[0]['value'],
        '__EVENTVALIDATION': soup.select("#__EVENTVALIDATION")[0]['value'],
    }
    for field, selector in _DROPDOWNS:
        ret[field] = _selected_value(soup, selector)
    return ret


def _get_options(soup, selector):
    """
    Return {value: label} for every non-placeholder <option> in the dropdown
    matched by `selector` ('null' is the site's placeholder entry).
    """
    ret = {}
    for option in soup.select(selector)[0].findAll('option'):
        value = option['value']
        if value != 'null':
            ret[value] = option.text
    return ret


def get_states(soup):
    """ Extract list of states from the webpage """
    return _get_options(soup, '#ctl00_Body_Content_drpState')


def get_districts(soup):
    """ Extract list of districts from the webpage """
    return _get_options(soup, '#ctl00_Body_Content_drpDistrict')


def get_sub_districts(soup):
    """ Extract list of sub districts from the webpage """
    return _get_options(soup, '#ctl00_Body_Content_drpSubDistrict')


def get_villages(soup):
    """ Extract list of villages from the webpage """
    return _get_options(soup, '#ctl00_Body_Content_drpVillage')


class browser(object):
    """
    Stateful wrapper around the census site's ASP.NET form.

    Drives the cascading State -> District -> Sub-district -> Village
    dropdowns by replaying the postbacks a real browser would make, keeping
    the hidden __VIEWSTATE/__EVENTVALIDATION fields in sync after each step.
    """

    def __init__(self):
        self.session = requests.Session()
        # Baseline form: every dropdown starts on its 'null' placeholder.
        self.FORM = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            'ctl00$Body_Content$drpState': 'null',
            'ctl00$Body_Content$drpDistrict': 'null',
            'ctl00$Body_Content$drpSubDistrict': 'null',
            'ctl00$Body_Content$drpVillage': 'null',
        }
        self.states = {}
        self.districts = {}
        self.subdistricts = {}
        self.villages = []
        self.URL = "http://www.censusindia.gov.in/Census_Data_2001/Village_Directory/View_data/Village_Profile.aspx"
        # Initial request seeds the hidden form state and the state list.
        r = self.session.post(self.URL)
        soup = BeautifulSoup(r.text, "html.parser")
        self.update_form(soup)
        self.states = get_states(soup)

    def post(self):
        """ Post to url; the post data is in self.FORM """
        r = self.session.post(self.URL, data=self.FORM)
        self.response = r
        if r.status_code == 500:
            raise InvalidPostback(r, self.FORM)
        return r

    def update_form(self, soup):
        """ Update self.FORM based on form values in the webpage """
        self.FORM.update(get_aspx_stuff(soup))

    def _post_dropdown(self, field, value, extract):
        """
        Select `value` in dropdown `field`, post back, refresh the form
        state, and return (response text, extract(soup)) where `extract`
        pulls the options the postback populated in the next dropdown.
        """
        self.FORM.update({'__EVENTTARGET': field, field: value})
        r = self.post()
        soup = BeautifulSoup(r.text, "html.parser")
        extracted = extract(soup)
        self.update_form(soup)
        return r.text, extracted

    def post_state(self, state):
        """Select a state; populates self.districts."""
        text, self.districts = self._post_dropdown(
            'ctl00$Body_Content$drpState', state, get_districts)
        return text

    def post_district(self, district):
        """Select a district; populates self.subdistricts."""
        text, self.subdistricts = self._post_dropdown(
            'ctl00$Body_Content$drpDistrict', district, get_sub_districts)
        return text

    def post_subdistrict(self, subdistrict):
        """Select a sub-district; populates self.villages."""
        text, self.villages = self._post_dropdown(
            'ctl00$Body_Content$drpSubDistrict', subdistrict, get_villages)
        return text

    def post_village(self, village):
        """
        Select a village, then press Submit; returns the html of the
        resulting village profile page.
        """
        self.FORM.update({'__EVENTTARGET': 'ctl00$Body_Content$drpVillage',
                          'ctl00$Body_Content$drpVillage': village})
        r = self.post()
        self.update_form(BeautifulSoup(r.text, "html.parser"))
        # Second postback simulates clicking the Submit button.
        self.FORM.update({'__EVENTTARGET': '',
                          'ctl00$Body_Content$btnSubmit': 'Submit'})
        r = self.post()
        # Drop the button field so later dropdown postbacks don't resubmit.
        del self.FORM['ctl00$Body_Content$btnSubmit']
        return r.text


def run_state(state, state_name, dir):
    """
    Download every village profile page in `state`.

    Saves each page to <dir>/<key>.html, where key is the concatenated
    state+district+subdistrict+village codes, and returns a dict mapping
    key -> metadata (codes and names) for that village.
    """
    print("State: %s %s" % (state, state_name))
    br = browser()
    br.post_state(state)
    metadata = {}
    for district, district_name in br.districts.items():
        print("District: %s %s" % (district, district_name))
        br.post_district(district)
        for subdist, subdist_name in br.subdistricts.items():
            print("Sub District: %s %s" % (subdist, subdist_name))
            br.post_subdistrict(subdist)
            for village, village_name in br.villages.items():
                html = br.post_village(village)
                key = state + district + subdist + village
                metadata[key] = {
                    'state': state,
                    'state_name': state_name,
                    'district': district,
                    'district_name': district_name,
                    'sub_district': subdist,
                    'sub_district_name': subdist_name,
                    'village': village,
                    'village_name': village_name,
                }
                # BUG FIX: write into the directory the caller passed in
                # (`dir`) rather than the module-global DATA_DIR, which only
                # exists when this module is run as a script.
                with codecs.open(path.join(dir, '%s.html' % key), 'w',
                                 encoding="utf-8") as f:
                    f.write(html)
    return metadata


if __name__ == "__main__":
    DATA_DIR = 'html'
    if not path.exists(DATA_DIR):
        os.makedirs(DATA_DIR)
    states = STATES.keys()
    # states = ['01', '02', '03']
    states = ['05', '06', '07', '08', '09', '10']
    for k in states:
        metadata = run_state(k, STATES[k], DATA_DIR)
        # One metadata file per state so a crash doesn't lose earlier states.
        with codecs.open('metadata%s.json' % k, 'w', encoding="utf-8") as f:
            json.dump(metadata, f)
