Update of /cvsroot/audacity/audacity-src/scripts/mw2html_audacity
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv15184
Modified Files:
mw2html.py
Log Message:
Fixing bugs reported by Martyn Shaw:
- CSS styles
- Anchor links
Index: mw2html.py
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/mw2html.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- mw2html.py 29 May 2009 04:04:20 -0000 1.2
+++ mw2html.py 1 Jun 2009 01:13:57 -0000 1.3
@@ -30,7 +30,7 @@
import errno
import hashlib
import httplib
-#import pdb
+import pdb
from time import strftime
try:
@@ -45,6 +45,7 @@
print ' http://www.connellybarnes.com/code/htmldata/'
sys.exit()
+config = None
MOVE_HREF = 'movehref'
MADE_BY_COMMENT = '<!-- Content generated by Mediawiki and mw2html -->'
INDEX_HTML = 'index.html'
@@ -101,23 +102,28 @@
Get domain of URL.
"""
url = normalize_url(u)
- pos = url.find('/')
- if pos == -1:
- return url
- else:
- return url[:pos]
+
+ #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
+ L = list(urlparse.urlparse(url))
+
+ return L[1]
def normalize_url(url):
-# url normalization
- curl = url.lower()
- nurl = url
+# url normalization - only for local comparison operations, use original url for online requests
+ url = split_section(url)[0]
+ nurl = url.lower()
- if curl.startswith('http://'):
+ if nurl.startswith('http://'):
nurl = nurl[len('http://'):]
- if curl.startswith('www.'):
+ if nurl.startswith('www.'):
nurl = nurl[len('www.'):]
nurl = nurl.strip('/')
+
+ nurl = 'http://' + nurl
+
+ urlparse.urljoin(config.rooturl, nurl)
+
return nurl
def find_tag_limits(doc, start_string, end_tag, start_tag):
@@ -145,12 +151,12 @@
return ndoc
ndoc = ndoc[:i1]+ndoc[i2+len(end_tag):]
-def monobook_fix_html(doc, config, page_url):
+def monobook_fix_html(doc, page_url):
"""
Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.
Also returns new urls eventually found.
"""
- global sidebar_content
+ global sidebar_content, config
if config.made_by:
doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
@@ -194,7 +200,7 @@
doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc)
# Remove Audacity syntax
- doc = re.sub('<style
type="text/css">/\*<!\[CDATA\[\*/[\s\S]+?</style>','',doc)
+ #doc = re.sub('<style type="text/css">/\*<!\[CDATA\[\*/[\s\S]+?</style>','',doc)
# Remove noexport
doc = remove_tag(doc,'<div class="noexport"','</div>', '<div')
@@ -220,32 +226,33 @@
return doc
-def pre_html_transform(doc, url, config):
+def pre_html_transform(doc, url):
"""
User-customizable HTML transform.
Given an HTML document (with URLs already rewritten), returns
modified HTML document and new urls from sidebar.
"""
+ global config
new_urls = []
if config.hack_skin:
if config.skin == MONOBOOK_SKIN:
- doc = monobook_fix_html(doc, config, url)
+ doc = monobook_fix_html(doc, url)
if not config.special_mode:
- doc = monobook_hack_skin_html(doc, config)
+ doc = monobook_hack_skin_html(doc)
else:
raise ValueError('unknown skin')
if config.move_href:
- doc = fix_move_href_tags(doc, config)
+ doc = fix_move_href_tags(doc)
if config.remove_history:
- doc = html_remove_image_history(doc, config)
+ doc = html_remove_image_history(doc)
return doc
-def pos_html_transform(doc, config):
- global footer_text
+def pos_html_transform(doc):
+ global footer_text, config
if config.special_mode:
# Remove external link rel stylesheet
@@ -280,7 +287,7 @@
return doc
-def fix_move_href_tags(doc, config):
+def fix_move_href_tags(doc):
"""
Return copy of doc with all MOVE_HREF tags removed.
"""
@@ -304,7 +311,7 @@
doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
return doc
-def html_remove_image_history(doc, config):
+def html_remove_image_history(doc):
"""
Remove image history and links to information.
"""
@@ -312,7 +319,7 @@
doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc)
return doc
-def monobook_hack_skin_html(doc, config):
+def monobook_hack_skin_html(doc):
"""
Hacks Monobook HTML output: use CSS ids for hacked skin.
@@ -323,7 +330,7 @@
doc = doc.replace('</body>', '<br></body>')
return doc
-def monobook_hack_skin_css(doc, url, config):
+def monobook_hack_skin_css(doc, url):
"""
Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks.
@@ -331,6 +338,8 @@
an orange bar at the top, and clear the orange bar right above the
footer.
"""
+ global config
+
if not url.endswith('monobook/main.css'):
return doc
@@ -377,21 +386,24 @@
return doc
-def post_css_transform(doc, url, config):
+def post_css_transform(doc, url):
"""
User-customizable CSS transform.
Given a CSS document (with URLs already rewritten), returns
modified CSS document.
"""
+ global config
+
if config.hack_skin and not config.special_mode:
if config.skin == MONOBOOK_SKIN:
- doc = monobook_hack_skin_css(doc, url, config)
+ doc = monobook_hack_skin_css(doc, url)
else:
raise ValueError('unknown skin')
return doc
-def move_to_index_if_needed(config, ans):
+def move_to_index_if_needed(ans):
+ global config
if ans.endswith(config.index):
ans = ans[:len(ans)-len(config.index)] + INDEX_HTML
return ans
@@ -418,9 +430,11 @@
return fullname
i += 1
-def clean_filename(url, config, ans):
+def clean_filename(url, ans):
# Split outdir and our file/dir under outdir
# (Note: ans may not be a valid filename)
+ global config
+
(par, ans) = (ans[:len(config.outdir)], ans[len(config.outdir):])
if ans.startswith(os.sep):
ans = ans[1:]
@@ -447,7 +461,8 @@
ans = 'math_' + hashlib.md5(tail).hexdigest()[:4] + '.png'
return os.path.join(par, ans)
-def flatten_filename(url, config, filename):
+def flatten_filename(url, filename):
+ global config
def get_fullname(relname):
return os.path.join(config.outdir, relname)
@@ -544,17 +559,13 @@
return (doc, mimetype)
-def url_to_filename(url, config):
+def url_to_filename(url):
"""
Translate a full url to a full filename (in local OS format) under outdir.
Transforms web url into local url and caches it.
Downloads the file to disk and works with it there instead of download the
same file two times (Performance Improvement).
"""
- if get_domain(url) != domain:
- url = normalize_url(urlparse.urljoin(config.rooturl, url))
-
- url = split_section(url)[0]
-
+ global config
nurl = normalize_url(url)
if nurl in url_filename_cache:
@@ -615,13 +626,13 @@
ans = os.path.join(config.outdir, subfile)
if config.flatten:
- ans = flatten_filename(url, config, ans)
+ ans = flatten_filename(url, ans)
if config.clean:
- ans = clean_filename(url, config, ans)
+ ans = clean_filename(url, ans)
if config.index != None:
- ans = move_to_index_if_needed(config, ans)
+ ans = move_to_index_if_needed(ans)
ans = find_unused_filename(ans, file_exists_in_written_set)
@@ -650,7 +661,7 @@
return ans
-def url_to_relative(url, cururl, config):
+def url_to_relative(url, cururl):
"""
Translate a full url to a filename (in URL format) relative to cururl.
Relative url from curul to url.
@@ -658,11 +669,11 @@
cururl = split_section(cururl)[0]
(url, section) = split_section(url)
- L1 = url_to_filename(url, config).replace(os.sep, '/').split('/')
+ L1 = url_to_filename(url).replace(os.sep, '/').split('/')
if L1 == '':
return ''
- L2 = url_to_filename(cururl, config).replace(os.sep, '/').split('/')
+ L2 = url_to_filename(cururl).replace(os.sep, '/').split('/')
while L1 != [] and L2 != [] and L1[0] == L2[0]:
L1 = L1[1:]
@@ -670,11 +681,13 @@
return urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
-def parse_css(doc, url, config):
+def parse_css(doc, url):
"""
Returns (modified_doc, new_urls), where new_urls are absolute URLs for
all links found in the CSS.
"""
+ global config
+
new_urls = []
L = htmldata.urlextract(doc, url, 'text/css')
@@ -687,34 +700,37 @@
continue
new_urls += [u]
- item.url = url_to_relative(u, url, config)
+ item.url = url_to_relative(u, url)
newdoc = htmldata.urljoin(doc, L)
- newdoc = post_css_transform(newdoc, url, config)
+ newdoc = post_css_transform(newdoc, url)
return (newdoc, new_urls)
-def should_follow(url, config):
+def should_follow(url):
"""
Returns a boolean for whether url should be spidered
Given that 'url' was linked to from site, return whether
'url' should be spidered as well.
"""
+ global config
+
# False if different domains.
- url = urlparse.urljoin(config.rooturl, url)
- if get_domain(config.rooturl) != get_domain(url):
+ nurl = normalize_url(url)
+ if get_domain(config.rooturl) != get_domain(nurl):
if config.debug:
print url, 'not in the same domain'
return False
# False if multiple query fields or parameters found
- if url.count('&') >= 1 or url.count(';') > 0:
+ if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in
('.css', 'gen=css')):
if config.debug:
print url, 'with multiple query fields'
return False
- if any(x in url for x in ('MediaWiki:', 'Special:', 'Image:', 'Talk:',
'User:', 'Help:')):
+ #if any(x in url for x in ('MediaWiki:', 'Special:', 'Image:', 'Talk:', 'User:', 'Help:')):
+ if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:')):
if config.debug:
print url, 'is a forbidden wiki page'
return False
@@ -725,7 +741,6 @@
return False
# limit_parent support
- nurl = normalize_url(url)
ncurl = normalize_url(config.rooturl)
if config.limit_parent and not nurl.startswith(ncurl):
@@ -744,17 +759,19 @@
return True
-def parse_html(doc, url, config):
+def parse_html(doc, url):
"""
Returns (modified_doc, new_urls), where new_urls are absolute URLs for
all links we want to spider in the HTML.
"""
+ global config
+
BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
END_COMMENT_REPLACE = '<ENDCOMMENT-' + str(random.random()) + '>'
new_urls = []
- doc = pre_html_transform(doc, url, config)
+ doc = pre_html_transform(doc, url)
# Temporarily "get rid" of comments so htmldata will find the URLs
# in the funky "<!--[if" HTML hackery for IE.
doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
@@ -764,13 +781,13 @@
for item in L:
u = item.url
- follow = should_follow(u, config)
+ follow = should_follow(u)
if follow:
if config.debug:
print 'ACCEPTED - ', u
# Store url locally.
new_urls += [u]
- item.url = url_to_relative(u, url, config)
+ item.url = url_to_relative(u, url)
else:
if config.debug:
print 'DENIED - ', u
@@ -779,16 +796,16 @@
newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
- newdoc = pos_html_transform(newdoc, config)
+ newdoc = pos_html_transform(newdoc)
return (newdoc, new_urls)
-def run(config, out=sys.stdout):
+def run(out=sys.stdout):
"""
Code interface.
"""
- global conn, domain, counter, redir_cache
+ global conn, domain, counter, redir_cache, config
if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
out.write('Please do not use robots with the Wikipedia site.\n')
@@ -811,33 +828,32 @@
start = True
while len(pending) > 0:
- url = normalize_url(pending.pop())
- if get_domain(url) != domain:
- url = normalize_url(urlparse.urljoin(config.rooturl, url))
+ url = pending.pop()
+ nurl = normalize_url(url)
- if url in redir_cache:
- url = normalize_url(redir_cache[url])
+ if nurl in redir_cache:
+ nurl = redir_cache[nurl]
- if url in complete:
+ if nurl in complete:
if config.debug:
print url, 'already processed'
continue
- complete.add(url)
- filename = url_to_filename(url, config)
+ complete.add(nurl)
+ filename = url_to_filename(url)
#this is needed for the first path as it doesn't know if it is a redirect or not in the begining
#at this point all the content of redir_cache is relative to the start path
if start:
start = False
- nurl = ''
- for red in redir_cache.iterkeys():
- nurl = normalize_url(red)
- url_filename_cache[nurl] = filename
- if nurl not in complete:
- complete.add(nurl)
- if nurl != '':
- url = normalize_url(redir_cache[url])
+ aux_url = ''
+ for redir in redir_cache.iterkeys():
+ aux_url = normalize_url(redir)
+ url_filename_cache[aux_url] = filename
+ if aux_url not in complete:
+ complete.add(aux_url)
+ if aux_url != '':
+ nurl = normalize_url(redir_cache[nurl])
if filename == '':
continue
@@ -852,9 +868,9 @@
new_urls = []
if filename.endswith('.html'):
- (doc, new_urls) = parse_html(doc, url, config)
+ (doc, new_urls) = parse_html(doc, nurl)
elif filename.endswith('.css'):
- (doc, new_urls) = parse_css(doc, url, config)
+ (doc, new_urls) = parse_css(doc, nurl)
# Enqueue URLs that we haven't yet spidered.
for u in new_urls:
@@ -947,6 +963,7 @@
"""
Command line interface.
"""
+ global config
try:
(opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fsdl:t:b:i:',
['force', 'no-flatten', 'no-clean',
@@ -1003,7 +1020,7 @@
config.index = arg
# Run program
- run(config)
+ run()
if __name__ == '__main__':
------------------------------------------------------------------------------
Register Now for Creativity and Technology (CaT), June 3rd, NYC. CaT
is a gathering of tech-side developers & brand creativity professionals. Meet
the minds behind Google Creative Lab, Visual Complexity, Processing, &
iPhoneDevCamp as they present alongside digital heavyweights like Barbarian
Group, R/GA, & Big Spaceship. http://p.sf.net/sfu/creativitycat-com
_______________________________________________
Audacity-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/audacity-cvs