[docs] import tool for current docs

Bruno Dumon Mon, 13 Jun 2005 12:36:14 -0700

Thought I'd help out a bit with the documentation effort by writing a
tool to import the current documentation.


The facts:
 * it's written in JavaScript (Rhino) to lower the hacking barrier
 * it imports the xdocs starting from the source tree. It translates
them to html with an XSL and cleans them up with the htmlcleaner (so
that they would look as if edited through the daisy-wiki)
 * it imports the images
 * while creating the documents and images, a mapping between original
filenames and daisy document IDs is kept, and in a second pass the links
in all documents are translated.

What still needs to be done & issues:
 * the current XSL just contains the bare minimum to get something done
(I focussed my efforts on the import.js), someone needs to
systematically look at all the tags in the document-v10 dtd to see if
they're handled correctly (e.g. <dl> is still todo).
 * testing/verifying of the results
 * only document-v10 based documents are supported; if necessary, support
for FAQs and other document types could be added (= mainly XSL work)
 * Daisy doesn't have a <code>-like tag, so we need to decide what to do
with such content. Daisy lacks this tag because the Mozilla/IE editor APIs
don't support creating this type of element.

How to use:
 * save the two attached files somewhere
 * customize the configuration variables on top of import.js
 * download/install a daisy distro (version 1.3-M2)
 * run with "$DAISY_HOME/bin/daisy-js import.js"

-- 
Bruno Dumon                             http://outerthought.org/
Outerthought - Open Source, Java & XML Competence Support Center
[EMAIL PROTECTED]                          [EMAIL PROTECTED]

<!--
  Translates a document-v10 xdoc into plain HTML.
  Only the document body is transformed; everything outside document/body
  is dropped. Elements without a dedicated template below are copied
  through unchanged by the identity template at the end.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Entry point: only the body of the document is processed. -->
  <xsl:template match="/">
    <xsl:apply-templates select="document/body"/>
  </xsl:template>

  <!-- The xdoc body becomes the HTML body. -->
  <xsl:template match="document/body">
    <html>
      <body>
        <xsl:apply-templates/>
      </body>
    </html>
  </xsl:template>

  <!-- Sections s1..s4: the title attribute becomes a heading of the same
       level, followed by the translated section content. -->
  <xsl:template match="s1">
    <h1><xsl:value-of select="@title"/></h1>
    <xsl:apply-templates/>
  </xsl:template>

  <xsl:template match="s2">
    <h2><xsl:value-of select="@title"/></h2>
    <xsl:apply-templates/>
  </xsl:template>

  <xsl:template match="s3">
    <h3><xsl:value-of select="@title"/></h3>
    <xsl:apply-templates/>
  </xsl:template>

  <xsl:template match="s4">
    <h4><xsl:value-of select="@title"/></h4>
    <xsl:apply-templates/>
  </xsl:template>

  <!-- sl is rendered as an unordered list; its children are handled by
       the identity template. -->
  <xsl:template match="sl">
    <ul><xsl:apply-templates/></ul>
  </xsl:template>

  <!-- All link flavours become a plain anchor; attributes are copied as-is. -->
  <xsl:template match="link|connect|jump|fork">
    <a><xsl:copy-of select="@*"/><xsl:apply-templates/></a>
  </xsl:template>

  <!-- figure becomes img; attributes are copied as-is. -->
  <xsl:template match="figure">
    <img><xsl:copy-of select="@*"/><xsl:apply-templates/></img>
  </xsl:template>

  <!-- source becomes preformatted text. -->
  <xsl:template match="source">
    <pre><xsl:apply-templates/></pre>
  </xsl:template>

  <!-- note and fixme become paragraphs tagged with a class. -->
  <xsl:template match="note">
    <p class="note"><xsl:apply-templates/></p>
  </xsl:template>

  <xsl:template match="fixme">
    <p class="fixme"><xsl:apply-templates/></p>
  </xsl:template>

  <!-- Identity template: copies every node and attribute that has no more
       specific template above. -->
  <xsl:template match="@*|node()">
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>

</xsl:stylesheet>

importPackage(java.io);
importPackage(Packages.org.outerj.daisy.repository);
importClass(Packages.org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager);

// Configuration -- customize these values for your environment before running.
// Root of the xdocs source tree; it is processed recursively.
var docRoot = "/home/bruno/oss/cocoon-2.1.7/src/documentation/xdocs/";
// Directory holding the images to import.
var imagesRoot = "/home/bruno/oss/cocoon-2.1.7/src/documentation/images/";
// Stylesheet used to translate xdocs to HTML.
var xsl = "document-to-daisyhtml.xsl";
// Configuration file for the HTML cleaner.
var htmlcleanerXml =
    "/home/bruno/projects/daisy/trunk/daisy/applications/daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml";
var repositoryURL = "http://localhost:9263";
var repoUser = "testuser"; // note: must be a user with Administrator rights
var repoPwd = "testuser";
var collection = "coolsite"; // collection in which to put the documents

// connect to repository
var repositoryManager = new RemoteRepositoryManager(repositoryURL,
    new Credentials(repoUser, repoPwd));
var repository = repositoryManager.getRepository(new Credentials(repoUser, repoPwd));
// setActiveRoleIds is handed a Java long[] (not a JavaScript array), so the
// array is built through reflection.
var activeRoleIds = java.lang.reflect.Array.newInstance(java.lang.Long.TYPE, 1);
activeRoleIds[0] = 1;
repository.setActiveRoleIds(activeRoleIds);

// build stylesheet template
var stylesheetSource = new Packages.javax.xml.transform.stream.StreamSource(xsl);
var transformerFactory = Packages.javax.xml.transform.TransformerFactory.newInstance();
var stylesheetTemplate = transformerFactory.newTemplates(stylesheetSource);

// build html cleaner template
var htmlcleanerconf = new java.io.File(htmlcleanerXml);
if (!htmlcleanerconf.exists()) {
  print("HTML Cleaner configuration file not found at: " + htmlcleanerconf.getAbsolutePath());
  quit();
}
var htmlCleanerFactory = new Packages.org.outerj.daisy.htmlcleaner.HtmlCleanerFactory();
var htmlCleanerTemplate = htmlCleanerFactory.buildTemplate(
    new Packages.org.xml.sax.InputSource(htmlcleanerconf.getAbsolutePath()));

// mapping of daisy ids and files
// list of created document ids (redundant)
var docids = [];
// mapping of document file URI (with .html extension instead of .xml) to
// created daisy document ID
var map = {};
// mapping of image file name to created daisy document ID
var imagesMap = {};


// fetch collection
var daisyCollection = repository.getCollectionManager().getCollection(collection, false);

// begin!
// Images are imported first, then the documents; links are translated last,
// once both mappings have been filled.
importImages();
importXdocsRecursive(new File(docRoot));
postProcessLinks();

// Recursively processes a directory containing xdoc documents.
//
// Every *.xml file except book.xml is parsed. Files whose doctype public ID
// is the document-v10 one are translated to HTML, get their links made
// absolute, and are stored as a new "SimpleDocument" in the repository.
// The created document ID is recorded in `map` (keyed by the file URI with
// an .html extension) and appended to `docids`. Other files are reported as
// skipped.
//
// dir: java.io.File of the directory to process.
// Throws an Error when the directory cannot be listed.
function importXdocsRecursive(dir) {
  var files = dir.listFiles();
  // listFiles() yields null for a missing, unreadable or non-directory path;
  // fail with a clear message instead of an obscure null dereference.
  if (files == null) {
    throw new Error("Unable to list directory (missing, not a directory, or unreadable): "
        + dir.getAbsolutePath());
  }

  for (var i = 0; i < files.length; i++) {
    var file = files[i];
    // NOTE: the loose == / != operators are intentional in this function: the
    // values compared are Java strings, which are not strictly equal to
    // JavaScript string literals.
    if (file.getName().endsWith(".xml") && file.getName() != "book.xml") {
      var document = parse(file.getAbsolutePath());
      print("parsed " + file.getAbsolutePath());
      var title = extractTitle(document, file.getName());
      var doctype = document.getDoctype();
      var usedoc = false;
      if (doctype == null) {
        print("SKIPPED file because it has no doctype: " + file.getAbsolutePath());
      } else if (doctype.getPublicId() == "-//APACHE//DTD Documentation V1.0//EN") {
        document = transformToDaisyHtml(document);
        usedoc = true;
      } else {
        print("SKIPPED file because it has an unsupported doctype: " + file.getAbsolutePath());
      }

      if (usedoc) {
        handleLinks(document, file);
        var serializedDoc = serializeDoc(document);

        // create doc in daisy
        var daisyDoc = repository.createDocument(title, "SimpleDocument");
        daisyDoc.setPart("SimpleDocumentContent", "text/xml", serializedDoc);
        daisyDoc.addToCollection(daisyCollection);
        daisyDoc.save();

        // remember the new ID so links to this file can be translated later
        map[changeExtension(file.toURI().toString())] = daisyDoc.getId();
        docids.push(daisyDoc.getId());
      }
    } else if (file.isDirectory()) {
      importXdocsRecursive(file);
    } else {
      print("SKIPPED file " + file.getAbsolutePath());
    }
  }
}

// Swaps a trailing ".xml" for ".html"; any other value is handed back as-is.
// uri: a Java string (length is read through the length() method).
function changeExtension(uri) {
  if (!uri.endsWith(".xml")) {
    return uri;
  }
  // drop the 4 characters of ".xml" before appending the new extension
  var stemEnd = uri.length() - 4;
  var stem = uri.substring(0, stemEnd);
  return stem + ".html";
}

// Parses an xml file or stream into a DOM document.
//
// data: whatever org.xml.sax.InputSource accepts in its constructor; callers
//       in this script pass a file path or an input stream.
// Returns the parsed DOM document.
function parse(data) {
  // directly construct xerces parser since this allows to set some useful
  // options, such as ignoring the dtd
  var parser = new Packages.org.apache.xerces.parsers.DOMParser();

  // all of these features are switched off
  var disabledFeatures = [
    "http://apache.org/xml/features/nonvalidating/load-external-dtd",
    "http://apache.org/xml/features/dom/include-ignorable-whitespace",
    "http://apache.org/xml/features/create-cdata-nodes",
    "http://xml.org/sax/features/external-general-entities",
    "http://xml.org/sax/features/external-parameter-entities"
  ];
  for (var i = 0; i < disabledFeatures.length; i++) {
    parser.setFeature(disabledFeatures[i], false);
  }

  parser.parse(new Packages.org.xml.sax.InputSource(data));
  return parser.getDocument();
}

// Extracts the text of the first <title> element in the document.
//
// document:     DOM document to search.
// defaultTitle: value returned when the document has no <title> element.
// Returns the concatenation of all text-node children of the first <title>
// element (an empty string if it has none), or defaultTitle.
function extractTitle(document, defaultTitle) {
  var titles = document.getElementsByTagName("title");
  if (titles.getLength() < 1) {
    return defaultTitle;
  }

  // hooray for dom!
  var children = titles.item(0).getChildNodes();
  var titleString = "";
  for (var i = 0; i < children.getLength(); i++) {
    var child = children.item(i);
    // Read the data of the child being inspected, not of the first child.
    if (child.getNodeType() == 3) { // 3 = text node type
      titleString += child.getData();
    }
  }
  return titleString;
}

// Runs the xdoc DOM through the precompiled stylesheet and hands back the
// node produced by the transformation.
function transformToDaisyHtml(document) {
  var dom = Packages.javax.xml.transform.dom;
  var xslt = stylesheetTemplate.newTransformer();
  var input = new dom.DOMSource(document);
  var output = new dom.DOMResult();
  xslt.transform(input, output);
  return output.getNode();
}

// Rewrites the href of every <a> element in the document so that it is
// resolved against the URI of the file the document came from. An href that
// cannot be resolved is reported and left untouched.
function handleLinks(document, file) {
  var baseURI = file.toURI();
  var anchors = document.getElementsByTagName("a");
  for (var n = 0; n < anchors.getLength(); n++) {
    var href = anchors.item(n).getAttributeNodeNS(null, "href");
    if (href == null) {
      continue; // anchor without href: nothing to resolve
    }
    try {
      var resolved = baseURI.resolve(href.getValue());
      href.setValue(resolved.toString());
    } catch (e) {
      print("error resolving uri, skipping: " + href.getValue());
    }
  }
}

// Serializes a DOM tree and pulls it through the HTML cleaner to get the
// exact HTML as it would have been saved in the daisy wiki.
//
// document: DOM node to serialize.
// Returns the result of the cleaner's cleanToByteArray() call.
function serializeDoc(document) {
  // a transformer created without a stylesheet is used to write the DOM
  // into a string
  var transformer = transformerFactory.newTransformer();
  var source = new Packages.javax.xml.transform.dom.DOMSource(document);
  var writer = new java.io.StringWriter();
  var result = new Packages.javax.xml.transform.stream.StreamResult(writer);
  transformer.transform(source, result);
  var xml = writer.toString();
  return htmlCleanerTemplate.newHtmlCleaner().cleanToByteArray(xml);
}

// For all created documents, translates links to other created documents.
//
// Each document listed in `docids` is fetched from the repository, its
// "SimpleDocumentContent" part is parsed, and translateLinks() is applied.
// The document is saved again only when at least one link was translated.
function postProcessLinks() {
  print("============================================================================");
  print("Will now translate links.");
  print("============================================================================");
  for (var i = 0; i < docids.length; i++) {
    // progress is reported 1-based so the last document shows as n/n
    print("Working on doc " + docids[i] + " (" + (i + 1) + "/" + docids.length + ")");
    var daisyDoc = repository.getDocument(docids[i], true);

    // always release the part's data stream, even when parsing fails
    var dataStream = daisyDoc.getPart("SimpleDocumentContent").getDataStream();
    var xmldoc;
    try {
      xmldoc = parse(dataStream);
    } finally {
      dataStream.close();
    }

    var didSomething = translateLinks(xmldoc);
    if (didSomething) {
      var serializedDoc = serializeDoc(xmldoc);
      daisyDoc.setPart("SimpleDocumentContent", "text/xml", serializedDoc);
      daisyDoc.save();
    }
  }
}

// Rewrites references inside the document to "daisy:<id>" form:
//  - an <a> href that is a key of `map`
//  - an <img> src starting with "images/" whose remainder is a key of
//    `imagesMap`
// Returns true when at least one attribute was rewritten.
function translateLinks(document) {
  var changed = false;
  var n;

  // normal links
  var anchors = document.getElementsByTagName("a");
  for (n = 0; n < anchors.getLength(); n++) {
    var href = anchors.item(n).getAttributeNodeNS(null, "href");
    if (href == null) {
      continue;
    }
    var docId = map[href.getValue()];
    if (docId == null) {
      continue; // not one of the imported documents
    }
    href.setValue("daisy:" + docId);
    changed = true;
    print("translated link");
  }

  // images
  var prefix = "images/";
  var pictures = document.getElementsByTagName("img");
  for (n = 0; n < pictures.getLength(); n++) {
    var srcNode = pictures.item(n).getAttributeNodeNS(null, "src");
    if (srcNode == null) {
      continue;
    }
    var path = srcNode.getValue();
    if (!path.startsWith(prefix)) {
      continue;
    }
    var imageName = path.substring(prefix.length);
    print ("looking up image " + imageName);
    var imageId = imagesMap[imageName];
    if (imageId != null) {
      srcNode.setValue("daisy:" + imageId);
      changed = true;
      print("translated image link");
    }
  }

  return changed;
}

// Imports the images found directly in `imagesRoot`.
//
// Every file ending in .jpg, .gif or .png is stored as a new "Image"
// document (named after the file without its extension) and added to the
// collection. The created document ID is recorded in `imagesMap`, keyed by
// the file name. Other files are ignored.
//
// Throws an Error when the images directory cannot be listed.
function importImages() {
  print("====================================================");
  print("Importing images");
  print("====================================================");

  // supported extensions and the mime type stored with the image data
  var mimeTypes = {
    ".jpg": "image/jpeg",
    ".png": "image/png",
    ".gif": "image/gif"
  };

  var imagesDir = new File(imagesRoot);
  var files = imagesDir.listFiles();
  // listFiles() yields null for a missing, unreadable or non-directory path;
  // fail with a clear message instead of an obscure null dereference.
  if (files == null) {
    throw new Error("Unable to list images directory (missing, not a directory, or unreadable): "
        + imagesDir.getAbsolutePath());
  }

  for (var i = 0; i < files.length; i++) {
    var file = files[i];
    var fileName = file.getName();
    var dot = fileName.lastIndexOf(".");
    if (dot < 0) {
      continue; // no extension at all
    }
    var mimeType = mimeTypes[fileName.substring(dot)];
    if (mimeType == null) {
      continue; // not a supported image type
    }

    print("Importing image " + fileName);
    // document name = file name without its extension
    var docName = fileName.substring(0, dot);
    var daisyDoc = repository.createDocument(docName, "Image");
    daisyDoc.setPart("ImageData", mimeType, new FilePartDataSource(file));
    daisyDoc.setPartFileName("ImageData", fileName);
    daisyDoc.addToCollection(daisyCollection);
    daisyDoc.save();
    imagesMap[fileName] = daisyDoc.getId();
    print("stored image " + fileName);
  }
}

[docs] import tool for current docs

Reply via email to