mw2html_audacity mw2html.py, 1.2, 1.3

André Pinto Sun, 31 May 2009 18:14:06 -0700

Update of /cvsroot/audacity/audacity-src/scripts/mw2html_audacity
In directory 23jxhf1.ch3.sourceforge.com:/tmp/cvs-serv15184


Modified Files:
        mw2html.py 
Log Message:
Fixing bugs reported by Martyn Shaw:
- CSS styles
- Anchor links

Index: mw2html.py
===================================================================
RCS file: /cvsroot/audacity/audacity-src/scripts/mw2html_audacity/mw2html.py,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- mw2html.py  29 May 2009 04:04:20 -0000      1.2
+++ mw2html.py  1 Jun 2009 01:13:57 -0000       1.3
@@ -30,7 +30,7 @@
 import errno
 import hashlib
 import httplib
-#import pdb
+import pdb
 from time import strftime
 
 try:
@@ -45,6 +45,7 @@
   print '  http://www.connellybarnes.com/code/htmldata/'
   sys.exit()
 
+config             = None
 MOVE_HREF          = 'movehref'
 MADE_BY_COMMENT    = '<!-- Content generated by Mediawiki and mw2html -->'
 INDEX_HTML         = 'index.html'
@@ -101,23 +102,28 @@
   Get domain of URL.
   """
   url = normalize_url(u)
-  pos = url.find('/')
-  if pos == -1:
-    return url
-  else:
-    return url[:pos]
+  
+  #ParseResult(scheme='http', netloc='www.cwi.nl:80', path='/%7Eguido/Python.html', params='', query='', fragment='')
+  L = list(urlparse.urlparse(url))
+  
+  return L[1]
 
 def normalize_url(url):
-# url normalization
-  curl = url.lower()
-  nurl = url
+# url normalization - only for local comparison operations, use original url for online requests
+  url = split_section(url)[0] 
+  nurl = url.lower()
   
-  if curl.startswith('http://'):
+  if nurl.startswith('http://'):
     nurl = nurl[len('http://'):]
-  if curl.startswith('www.'):
+  if nurl.startswith('www.'):
     nurl = nurl[len('www.'):]
   
   nurl = nurl.strip('/')
+  
+  nurl = 'http://' + nurl
+  
+  urlparse.urljoin(config.rooturl, nurl)
+  
   return nurl
 
 def find_tag_limits(doc, start_string, end_tag, start_tag):
@@ -145,12 +151,12 @@
       return ndoc
     ndoc = ndoc[:i1]+ndoc[i2+len(end_tag):]
  
-def monobook_fix_html(doc, config, page_url):
+def monobook_fix_html(doc, page_url):
   """
   Sets sidebar for Mediawiki 1.4beta6 Monobook HTML output.
   Also returns new urls eventually found.
   """
-  global sidebar_content
+  global sidebar_content, config
  
   if config.made_by:
     doc = doc.replace('<html xmlns=', MADE_BY_COMMENT + '\n<html xmlns=')
@@ -194,7 +200,7 @@
     doc = re.sub(r'<div class="printfooter">[\s\S]+?</div>',r'',doc)
  
     # Remove Audacity syntax
-    doc = re.sub('<style type="text/css">/\*<!\[CDATA\[\*/[\s\S]+?</style>','',doc)
+    #doc = re.sub('<style type="text/css">/\*<!\[CDATA\[\*/[\s\S]+?</style>','',doc)
 
     # Remove noexport
     doc = remove_tag(doc,'<div class="noexport"','</div>', '<div')
@@ -220,32 +226,33 @@
 
   return doc
   
-def pre_html_transform(doc, url, config):
+def pre_html_transform(doc, url):
   """
   User-customizable HTML transform.
 
   Given an HTML document (with URLs already rewritten), returns
   modified HTML document and new urls from sidebar.
   """
+  global config
   new_urls = []
   
   if config.hack_skin:
     if config.skin == MONOBOOK_SKIN:
-      doc = monobook_fix_html(doc, config, url)
+      doc = monobook_fix_html(doc, url)
       if not config.special_mode:
-        doc = monobook_hack_skin_html(doc, config)
+        doc = monobook_hack_skin_html(doc)
     else:
       raise ValueError('unknown skin')
   
   if config.move_href:
-    doc = fix_move_href_tags(doc, config)
+    doc = fix_move_href_tags(doc)
   if config.remove_history:
-    doc = html_remove_image_history(doc, config)
+    doc = html_remove_image_history(doc)
   
   return doc
   
-def pos_html_transform(doc, config):
-  global footer_text
+def pos_html_transform(doc):
+  global footer_text, config
   
   if config.special_mode:
     # Remove external link rel stylesheet
@@ -280,7 +287,7 @@
     
   return doc
 
-def fix_move_href_tags(doc, config):
+def fix_move_href_tags(doc):
   """
   Return copy of doc with all MOVE_HREF tags removed.
   """
@@ -304,7 +311,7 @@
     doc = doc[:start] + htmldata.tagjoin(new_tags) + doc[end:]
   return doc
 
-def html_remove_image_history(doc, config):
+def html_remove_image_history(doc):
   """
   Remove image history and links to information.
   """
@@ -312,7 +319,7 @@
   doc = re.sub(r'<h2>Image links</h2>[\s\S]+?</ul>', r'', doc)
   return doc
 
-def monobook_hack_skin_html(doc, config):
+def monobook_hack_skin_html(doc):
   """
   Hacks Monobook HTML output: use CSS ids for hacked skin.
 
@@ -323,7 +330,7 @@
   doc = doc.replace('</body>', '<br></body>')  
   return doc
 
-def monobook_hack_skin_css(doc, url, config):
+def monobook_hack_skin_css(doc, url):
   """
   Hacks Mediawiki 1.4beta6 Monobook main CSS file for better looks.
 
@@ -331,6 +338,8 @@
   an orange bar at the top, and clear the orange bar right above the
   footer.
   """
+  global config
+  
   if not url.endswith('monobook/main.css'):
     return doc
 
@@ -377,21 +386,24 @@
 
   return doc
 
-def post_css_transform(doc, url, config):
+def post_css_transform(doc, url):
   """
   User-customizable CSS transform.
 
   Given a CSS document (with URLs already rewritten), returns
   modified CSS document.
   """
+  global config
+  
   if config.hack_skin and not config.special_mode:
     if config.skin == MONOBOOK_SKIN:
-      doc = monobook_hack_skin_css(doc, url, config)
+      doc = monobook_hack_skin_css(doc, url)
     else:
       raise ValueError('unknown skin')
   return doc
   
-def move_to_index_if_needed(config, ans):
+def move_to_index_if_needed(ans):
+  global config
   if ans.endswith(config.index):
     ans = ans[:len(ans)-len(config.index)] + INDEX_HTML
   return ans
@@ -418,9 +430,11 @@
       return fullname
     i += 1
 
-def clean_filename(url, config, ans):
+def clean_filename(url, ans):
   # Split outdir and our file/dir under outdir
   # (Note: ans may not be a valid filename)
+  global config
+  
   (par, ans) = (ans[:len(config.outdir)], ans[len(config.outdir):])
   if ans.startswith(os.sep):
     ans = ans[1:]
@@ -447,7 +461,8 @@
         ans = 'math_' + hashlib.md5(tail).hexdigest()[:4] + '.png'
   return os.path.join(par, ans)
 
-def flatten_filename(url, config, filename):
+def flatten_filename(url, filename):
+  global config
   def get_fullname(relname):
     return os.path.join(config.outdir, relname)
 
@@ -544,17 +559,13 @@
 
   return (doc, mimetype)
   
-def url_to_filename(url, config):
+def url_to_filename(url):
   """
   Translate a full url to a full filename (in local OS format) under outdir.
   Transforms web url into local url and caches it.
   Downloads the file to disk and works with it there instead of download the same file two times (Performance Improvement).
   """ 
-  if get_domain(url) != domain:
-    url = normalize_url(urlparse.urljoin(config.rooturl, url))
-
-  url = split_section(url)[0]
-
+  global config
   nurl = normalize_url(url)
 
   if nurl in url_filename_cache:
@@ -615,13 +626,13 @@
   ans = os.path.join(config.outdir, subfile)
 
   if config.flatten:
-    ans = flatten_filename(url, config, ans)
+    ans = flatten_filename(url, ans)
 
   if config.clean:
-    ans = clean_filename(url, config, ans)
+    ans = clean_filename(url, ans)
 
   if config.index != None:
-    ans = move_to_index_if_needed(config, ans)
+    ans = move_to_index_if_needed(ans)
        
   ans = find_unused_filename(ans, file_exists_in_written_set)
 
@@ -650,7 +661,7 @@
 
   return ans
 
-def url_to_relative(url, cururl, config):
+def url_to_relative(url, cururl):
   """
   Translate a full url to a filename (in URL format) relative to cururl.
   Relative url from curul to url.
@@ -658,11 +669,11 @@
   cururl = split_section(cururl)[0]
   (url, section) = split_section(url)
 
-  L1 = url_to_filename(url,    config).replace(os.sep, '/').split('/')
+  L1 = url_to_filename(url).replace(os.sep, '/').split('/')
   if L1 == '':
     return ''
 
-  L2 = url_to_filename(cururl, config).replace(os.sep, '/').split('/')
+  L2 = url_to_filename(cururl).replace(os.sep, '/').split('/')
 
   while L1 != [] and L2 != [] and L1[0] == L2[0]:
     L1 = L1[1:]
@@ -670,11 +681,13 @@
 
   return urllib.quote('../' * (len(L2) - 1) + '/'.join(L1)) + section
 
-def parse_css(doc, url, config):
+def parse_css(doc, url):
   """
   Returns (modified_doc, new_urls), where new_urls are absolute URLs for
   all links found in the CSS.
   """
+  global config
+  
   new_urls = []  
 
   L = htmldata.urlextract(doc, url, 'text/css')
@@ -687,34 +700,37 @@
       continue
 
     new_urls += [u]
-    item.url = url_to_relative(u, url, config)
+    item.url = url_to_relative(u, url)
 
   newdoc = htmldata.urljoin(doc, L)
-  newdoc = post_css_transform(newdoc, url, config)
+  newdoc = post_css_transform(newdoc, url)
 
   return (newdoc, new_urls)
 
-def should_follow(url, config):
+def should_follow(url):
   """
   Returns a boolean for whether url should be spidered
 
   Given that 'url' was linked to from site, return whether
   'url' should be spidered as well.
   """
+  global config
+  
   # False if different domains.
-  url = urlparse.urljoin(config.rooturl, url)
-  if get_domain(config.rooturl) != get_domain(url):
+  nurl = normalize_url(url)
+  if get_domain(config.rooturl) != get_domain(nurl):
     if config.debug:
       print url, 'not in the same domain'
     return False
   
   # False if multiple query fields or parameters found
-  if url.count('&') >= 1 or url.count(';') > 0:
+  if (url.count('&') >= 1 or url.count(';') > 0) and not any(x in url for x in ('.css', 'gen=css')):
     if config.debug:
       print url, 'with multiple query fields'
     return False
 
-  if any(x in url for x in ('MediaWiki:', 'Special:', 'Image:', 'Talk:', 'User:', 'Help:')):
+  #if any(x in url for x in ('MediaWiki:', 'Special:', 'Image:', 'Talk:', 'User:', 'Help:')):
+  if any(x in url for x in ('Special:', 'Image:', 'Talk:', 'User:', 'Help:')):
     if config.debug:
       print url, 'is a forbidden wiki page'
     return False
@@ -725,7 +741,6 @@
     return False
 
   # limit_parent support
-  nurl  = normalize_url(url)
   ncurl = normalize_url(config.rooturl)
   
   if config.limit_parent and not nurl.startswith(ncurl):
@@ -744,17 +759,19 @@
        
   return True
 
-def parse_html(doc, url, config):
+def parse_html(doc, url):
   """
   Returns (modified_doc, new_urls), where new_urls are absolute URLs for
   all links we want to spider in the HTML.
   """
+  global config
+  
   BEGIN_COMMENT_REPLACE = '<BEGINCOMMENT-' + str(random.random()) + '>'
   END_COMMENT_REPLACE   = '<ENDCOMMENT-' + str(random.random()) + '>'
 
   new_urls = []  
 
-  doc = pre_html_transform(doc, url, config)  
+  doc = pre_html_transform(doc, url)  
   # Temporarily "get rid" of comments so htmldata will find the URLs
   # in the funky "<!--[if" HTML hackery for IE.
   doc = doc.replace('<!--', BEGIN_COMMENT_REPLACE)
@@ -764,13 +781,13 @@
 
   for item in L:
     u = item.url
-    follow = should_follow(u, config)
+    follow = should_follow(u)
     if follow:
       if config.debug:
         print 'ACCEPTED   - ', u
       # Store url locally.
       new_urls += [u]
-      item.url = url_to_relative(u, url, config)
+      item.url = url_to_relative(u, url)
     else:
       if config.debug:
         print 'DENIED     - ', u
@@ -779,16 +796,16 @@
   newdoc = newdoc.replace(BEGIN_COMMENT_REPLACE, '<!--')
   newdoc = newdoc.replace(END_COMMENT_REPLACE, '-->')
 
-  newdoc = pos_html_transform(newdoc, config)
+  newdoc = pos_html_transform(newdoc)
   
   return (newdoc, new_urls)
   
 
-def run(config, out=sys.stdout):
+def run(out=sys.stdout):
   """
   Code interface.
   """
-  global conn, domain, counter, redir_cache
+  global conn, domain, counter, redir_cache, config
   
   if urlparse.urlparse(config.rooturl)[1].lower().endswith('wikipedia.org'):
     out.write('Please do not use robots with the Wikipedia site.\n')
@@ -811,33 +828,32 @@
   
   start = True
   while len(pending) > 0:
-    url = normalize_url(pending.pop())
-    if get_domain(url) != domain:
-      url = normalize_url(urlparse.urljoin(config.rooturl, url))
+    url = pending.pop()
+    nurl = normalize_url(url)
     
-    if url in redir_cache:
-      url = normalize_url(redir_cache[url])
+    if nurl in redir_cache:
+      nurl = redir_cache[nurl]
 
-    if url in complete:
+    if nurl in complete:
       if config.debug:
         print url, 'already processed'
       continue
  
-    complete.add(url)
-    filename = url_to_filename(url, config)
+    complete.add(nurl)
+    filename = url_to_filename(url)
 
     #this is needed for the first path as it doesn't know if it is a redirect or not in the begining
     #at this point all the content of redir_cache is relative to the start path
     if start:
       start = False
-      nurl = ''
-      for red in redir_cache.iterkeys():
-        nurl = normalize_url(red)
-        url_filename_cache[nurl] = filename
-        if nurl not in complete:
-          complete.add(nurl)
-      if nurl != '':
-        url = normalize_url(redir_cache[url])
+      aux_url = ''
+      for redir in redir_cache.iterkeys():
+        aux_url = normalize_url(redir)
+        url_filename_cache[aux_url] = filename
+        if aux_url not in complete:
+          complete.add(aux_url)
+      if aux_url != '':
+        nurl = normalize_url(redir_cache[nurl])
  
     if filename == '':
       continue
@@ -852,9 +868,9 @@
     new_urls = []
 
     if filename.endswith('.html'):
-      (doc, new_urls) = parse_html(doc, url, config)
+      (doc, new_urls) = parse_html(doc, nurl)
     elif filename.endswith('.css'):
-      (doc, new_urls) = parse_css(doc, url, config)
+      (doc, new_urls) = parse_css(doc, nurl)
 
     # Enqueue URLs that we haven't yet spidered.
     for u in new_urls:
@@ -947,6 +963,7 @@
   """
   Command line interface.
   """
+  global config
   try:
     (opts, args) = getopt.gnu_getopt(sys.argv[1:], 'fsdl:t:b:i:',
                    ['force', 'no-flatten', 'no-clean',
@@ -1003,7 +1020,7 @@
       config.index          = arg
 
   # Run program
-  run(config)
+  run()
 
 
 if __name__ == '__main__':


------------------------------------------------------------------------------
Register Now for Creativity and Technology (CaT), June 3rd, NYC. CaT 
is a gathering of tech-side developers & brand creativity professionals. Meet
the minds behind Google Creative Lab, Visual Complexity, Processing, & 
iPhoneDevCamp as they present alongside digital heavyweights like Barbarian 
Group, R/GA, & Big Spaceship. http://p.sf.net/sfu/creativitycat-com 
_______________________________________________
Audacity-cvs mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/audacity-cvs

[Audacity-cvs] audacity-src/scripts/mw2html_audacity mw2html.py, 1.2, 1.3

Reply via email to