Hi,
here at ADS we have a few scanned articles with OCRed text. The articles
are public (e.g. this one:
http://articles.adsabs.harvard.edu/full/1870AN.....77Q..61.) but the
OCRed text is not. In order to access it, we need a magic set of HTTP
headers. Currently there is no way to have bibindex use specific headers
for certain URLs. I think that it might be useful for other people to
have this possibility so let me share it with you.
The attached patch permits just that.
Benoit.
From 8ff399176799b246f825ac6f5c9d1bc91c51d1e0 Mon Sep 17 00:00:00 2001
From: Benoit Thiell <[email protected]>
Date: Thu, 13 May 2010 18:13:57 -0400
Subject: [PATCH] WebSubmit: Specific HTTP headers for URL downloads
* The variable websubmit_config.CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS
allows specific HTTP headers to be used for URLs matching a regular
expression.
---
modules/websubmit/lib/bibdocfile.py | 13 +++++++++++--
modules/websubmit/lib/websubmit_config.py | 8 ++++++++
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/modules/websubmit/lib/bibdocfile.py b/modules/websubmit/lib/bibdocfile.py
index 27be3cd..e6c38ce 100644
--- a/modules/websubmit/lib/bibdocfile.py
+++ b/modules/websubmit/lib/bibdocfile.py
@@ -101,7 +101,7 @@ from invenio.config import CFG_SITE_LANG, CFG_SITE_URL, \
CFG_BIBDOCFILE_USE_XSENDFILE, \
CFG_BIBDOCFILE_MD5_CHECK_PROBABILITY
from invenio.websubmit_config import CFG_WEBSUBMIT_ICON_SUBFORMAT_RE, \
- CFG_WEBSUBMIT_DEFAULT_ICON_SUBFORMAT
+ CFG_WEBSUBMIT_DEFAULT_ICON_SUBFORMAT, CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS
from invenio.bibformat import format_record
import invenio.template
websubmit_templates = invenio.template.load('websubmit')
@@ -3328,7 +3328,16 @@ def download_url(url, format=None, sleep=2):
raise StandardError, "%s is not in one of the allowed paths."
% path
else:
try:
- from_file = urllib2.urlopen(url)
+ for regex, headers in \
+ CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS.items():
+ if regex.search(url) is not None:
+ # Matching URL: Use the specific headers.
+ request = urllib2.Request(url, headers=headers)
+ break
+ else:
+ request = urllib2.Request(url)
+
+ from_file = urllib2.urlopen(request)
to_file = open(tmppath, 'w')
while True:
block = from_file.read(CFG_BIBDOCFILE_BLOCK_SIZE)
diff --git a/modules/websubmit/lib/websubmit_config.py b/modules/websubmit/lib/websubmit_config.py
index 5848447..1dd551c 100644
--- a/modules/websubmit/lib/websubmit_config.py
+++ b/modules/websubmit/lib/websubmit_config.py
@@ -90,6 +90,14 @@ CFG_WEBSUBMIT_ICON_SUBFORMAT_RE = re.compile(r"icon.*")
## when creating new icons.
CFG_WEBSUBMIT_DEFAULT_ICON_SUBFORMAT = "icon"
+## CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS -- this dictionary allows specific
+## HTTP headers to be used when downloading URLs matching a regular expression.
+## Example:
+## CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS = {
+## re.compile(r'http://myurl\.com/.*'): {'User-Agent': 'Me'},
+## }
+CFG_WEBSUBMIT_URL_DOWNLOAD_HEADERS = {}
+
class InvenioWebSubmitFunctionError(Exception):
"""This exception should only ever be raised by WebSubmit functions.
--
1.5.5.6