[LyX GSoC/epub/master] Parse XHTML for EPUB metadata.

Josh Hieronymus Wed, 11 Sep 2013 12:37:36 -0700

The branch, epub/master, has been updated.

- Log -----------------------------------------------------------------


commit e173486ac219a3f8ebdab13cbcd725f09deb0305
Author: Josh Hieronymus <[email protected]>
Date:   Wed Sep 11 13:43:38 2013 -0400

    Parse XHTML for EPUB metadata.
    
    The values for the EPUB title, language, identifier, identifier
    scheme, author name, and author name in file-as order are now
    extracted from the XHTML file instead of using default values
    (which are still available when a value cannot be extracted).

diff --git a/lib/scripts/epub/epub_oebps.py b/lib/scripts/epub/epub_oebps.py
index 91e5bb5..824a3cd 100644
--- a/lib/scripts/epub/epub_oebps.py
+++ b/lib/scripts/epub/epub_oebps.py
@@ -13,6 +13,11 @@ import os.path
 import xml.etree.ElementTree as ET
 import zipfile
 
+from epub_xhtml_utilities import extract_author
+from epub_xhtml_utilities import extract_author_file_as
+from epub_xhtml_utilities import extract_identifier
+from epub_xhtml_utilities import extract_identifier_scheme
+from epub_xhtml_utilities import extract_language
 from epub_xhtml_utilities import extract_title
 
 class Oebps(object):
@@ -78,7 +83,6 @@ class Oebps(object):
         self.toc.add_to_epub_file(epub_file)
 
 # TODO: (Potentially) split XHTML file into smaller files
-# TODO: Parse XHTML file for language, identifier, author, etc.
 class Content(object):
     FILENAME = "content.opf"
     XML_VERSION = "1.0"
@@ -130,12 +134,6 @@ class Content(object):
         IDENTIFIER_TAG = "{{{}}}identifier".format(XMLNS)
         CREATOR_TAG = "{{{}}}creator".format(XMLNS)
         # Defaults for information that can't be extracted successfully.
-        TITLE = "A LyX-Created EPUB Book"
-        LANGUAGE = "en"
-        IDENTIFIER = "123456789X"
-        IDENTIFIER_SCHEME = "ISBN"
-        CREATOR = "LyX User"
-        CREATOR_FILE_AS = "User, LyX"
         CREATOR_ROLE = "aut"
         def __init__(self, xhtml_root):
             self.root = 
ET.Element("{{{}}}metadata".format(Content.PACKAGE_XMLNS))
@@ -143,17 +141,19 @@ class Content(object):
             self.title.text = extract_title(xhtml_root)
             self.root.append(self.title)
             self.language = ET.Element(self.LANGUAGE_TAG)
-            self.language.text = self.LANGUAGE
+            self.language.text = extract_language(xhtml_root)
             self.root.append(self.language)
             self.identifier = ET.Element(self.IDENTIFIER_TAG)
             self.identifier.set("id", Content.PACKAGE_UNIQUE_IDENTIFIER)
-            self.identifier.set("{{{}}}scheme".format(Content.PACKAGE_XMLNS), 
self.IDENTIFIER_SCHEME)
-            self.identifier.text = self.IDENTIFIER
+            identifier_scheme = extract_identifier_scheme(xhtml_root)
+            self.identifier.set("{{{}}}scheme".format(Content.PACKAGE_XMLNS), 
identifier_scheme)
+            self.identifier.text = extract_identifier(xhtml_root)
             self.root.append(self.identifier)
             self.creator = ET.Element(self.CREATOR_TAG)
-            self.creator.set("{{{}}}file-as".format(Content.PACKAGE_XMLNS), 
self.CREATOR_FILE_AS)
+            author_file_as = extract_author_file_as(xhtml_root)
+            self.creator.set("{{{}}}file-as".format(Content.PACKAGE_XMLNS), 
author_file_as)
             self.creator.set("{{{}}}role".format(Content.PACKAGE_XMLNS), 
self.CREATOR_ROLE)
-            self.creator.text = self.CREATOR
+            self.creator.text = extract_author(xhtml_root)
             self.root.append(self.creator)
 
     class Manifest(object):
@@ -226,20 +226,19 @@ class Content(object):
             self.root.append(itemref)
 
 # TODO: (Potentially) add NavPoints for skipping around in file, bookmarks, 
etc.
-# TODO: Parse XHTML file for title, language, identifier, author, etc.
 class Toc(object):
     FILENAME = "toc.ncx"
     XML_VERSION = "1.0"
     XML_ENCODING = "UTF-8"
     NCX_VERSION = "2005-1"
-    NCX_XML_LANG ="en"
     NCX_XMLNS = "http://www.daisy.org/z3986/2005/ncx/";
     
     def __init__(self, pathname, xhtml_name, xhtml_root):
         self.pathname = pathname
         self.root = ET.Element("ncx")
         self.root.set("version", self.NCX_VERSION)
-        self.root.set("xml:lang", self.NCX_XML_LANG)
+        xml_lang = extract_language(xhtml_root)
+        self.root.set("xml:lang", xml_lang)
         self.root.set("xmlns", self.NCX_XMLNS)
         self.head = self.Head(xhtml_root)
         self.root.append(self.head.root)
@@ -272,16 +271,15 @@ class Toc(object):
         archive_name = os.path.normpath(os.path.join(self.pathname, 
self.FILENAME))
         epub_file.write(self.FILENAME, archive_name, zipfile.ZIP_DEFLATED)
 
-    # TODO: extract uid, (possibly) depth from xhtml_root
+    # TODO: (possibly) extract depth from xhtml_root
     class Head(object):
         DEPTH = "1" # depth of the table of contents--some e-readers don't 
allow > 1
         TOTAL_PAGE_COUNT = "0" # only used with navigable pages
         MAX_PAGE_NUMBER = "0" # only used with navigable pages
         def __init__(self, xhtml_root):
             self.root = ET.Element("head")
-            # should parse xhtml_root for uid
             # uid needs to match opf identifier
-            uid = "123456789X"
+            uid = extract_identifier(xhtml_root)
             self.add_meta("dtb:uid", uid)
             self.add_meta("dtb:depth", self.DEPTH)
             self.add_meta("dtb:totalPageCount", self.TOTAL_PAGE_COUNT)
@@ -302,15 +300,13 @@ class Toc(object):
             text.text = title
             self.root.append(text)
 
-    # TODO: extract author from xhtml_root
     class DocAuthor(object):
         def __init__(self, xhtml_root):
             self.root = ET.Element("docAuthor")
             text = ET.Element("text")
-            # should parse xhtml_root for author
             # author should match opf author
             # author should appear in file-as format
-            author = "User, LyX"
+            author = extract_author_file_as(xhtml_root)
             text.text = author
             self.root.append(text)
 
diff --git a/lib/scripts/epub/epub_xhtml_utilities.py 
b/lib/scripts/epub/epub_xhtml_utilities.py
index d46817e..3252e4a 100644
--- a/lib/scripts/epub/epub_xhtml_utilities.py
+++ b/lib/scripts/epub/epub_xhtml_utilities.py
@@ -66,66 +66,66 @@ def extract_language(xhtml_root):
     language_xhtml_class = "epub-language"
     language = extract_attribute(xhtml_root, language_xhtml_class)
     if language is None:
-               language = _DEFAULT_LANGUAGE
-       
-       return language
+        language = _DEFAULT_LANGUAGE
+    
+    return language
 
 def extract_identifier(xhtml_root):
-       """Extract an EPUB identifier from an XHTML file and return it.
+    """Extract an EPUB identifier from an XHTML file and return it.
     
     Keyword Arguments:
     xhtml_root -- an ElementTree Element representing the root element of a LyX
         document's exported XHTML file
 
     """
-       identifier_xhtml_class = "epub-identifier"
+    identifier_xhtml_class = "epub-identifier"
     identifier = extract_attribute(xhtml_root, identifier_xhtml_class)
     if identifier is None:
-               identifier = _DEFAULT_IDENTIFIER
-       
-       return identifier
+        identifier = _DEFAULT_IDENTIFIER
+    
+    return identifier
 
 def extract_identifier_scheme(xhtml_root):
-       """Extract an identifier scheme from an XHTML file and return it.
+    """Extract an identifier scheme from an XHTML file and return it.
     
     Keyword Arguments:
     xhtml_root -- an ElementTree Element representing the root element of a LyX
         document's exported XHTML file
 
     """
-       identifier_scheme_xhtml_class = "epub-identifier-scheme"
+    identifier_scheme_xhtml_class = "epub-identifier-scheme"
     identifier_scheme = extract_attribute(xhtml_root, 
identifier_scheme_xhtml_class)
     if identifier_scheme is None:
-               identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME
-       
-       return identifier_scheme
+        identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME
+    
+    return identifier_scheme
 
 def extract_author(xhtml_root):
-       """Extract an EPUB author (in reading order) from an XHTML file and 
return it.
+    """Extract an EPUB author (in reading order) from an XHTML file and return 
it.
     
     Keyword Arguments:
     xhtml_root -- an ElementTree Element representing the root element of a LyX
         document's exported XHTML file
 
     """
-       author_xhtml_class = "epub-author-reading-order"
+    author_xhtml_class = "epub-author-reading-order"
     author = extract_attribute(xhtml_root, author_xhtml_class)
     if author is None:
-               author = _DEFAULT_AUTHOR
-       
-       return author
+        author = _DEFAULT_AUTHOR
+    
+    return author
 
 def extract_author_file_as(xhtml_root):
-       """Extract an EPUB author (in file-as order) from an XHTML file and 
return it.
+    """Extract an EPUB author (in file-as order) from an XHTML file and return 
it.
     
     Keyword Arguments:
     xhtml_root -- an ElementTree Element representing the root element of a LyX
         document's exported XHTML file
 
     """
-       author_file_as_xhtml_class = "epub-author-file-as-order"
+    author_file_as_xhtml_class = "epub-author-file-as-order"
     author_file_as = extract_attribute(xhtml_root, author_file_as_xhtml_class)
     if author_file_as is None:
-               author_file_as = _DEFAULT_AUTHOR_FILE_AS
-       
-       return author_file_as
+        author_file_as = _DEFAULT_AUTHOR_FILE_AS
+    
+    return author_file_as

commit f9ae3cadb6c1664ea47d5ba43843d65b95f0602d
Author: Josh Hieronymus <[email protected]>
Date:   Wed Sep 11 12:41:46 2013 -0400

    Implement more functions to extract EPUB metadata from XHTML.

diff --git a/lib/scripts/epub/epub_xhtml_utilities.py 
b/lib/scripts/epub/epub_xhtml_utilities.py
index 992b0cc..d46817e 100644
--- a/lib/scripts/epub/epub_xhtml_utilities.py
+++ b/lib/scripts/epub/epub_xhtml_utilities.py
@@ -13,21 +13,119 @@ import xml.etree.ElementTree as ET
 
 # XML namespace used in the exported XHTML file
 _XHTML_XMLNS = "http://www.w3.org/1999/xhtml";
-# a title to be returned when the document's title cannot be extracted
+# metadata values to be used when the document's metadata cannot be extracted
 _DEFAULT_TITLE = "A LyX-Created EPUB Book"
+_DEFAULT_LANGUAGE = "en"
+_DEFAULT_IDENTIFIER = "123456789X"
+_DEFAULT_IDENTIFIER_SCHEME = "ISBN"
+_DEFAULT_AUTHOR = "LyX User"
+_DEFAULT_AUTHOR_FILE_AS = "User, LyX"
+
+def extract_attribute(xhtml_root, attribute_xhtml_class):
+    """Extract an EPUB metadata attribute from an XHTML file and return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+    
+    attribute_xhtml_class -- the value of the class attribute of the XHTML 
element
+        containing the attribute
+
+    """
+    attribute_container_xpath_schema = ".//{{{0}}}div[@class='epub-metadata 
{1}']/{{{0}}}a"
+    attribute_container_xpath = 
attribute_container_xpath_schema.format(_XHTML_XMLNS, attribute_xhtml_class)
+    attribute_container = xhtml_root.find(attribute_container_xpath)
+    if attribute_container is not None:
+        return attribute_container.tail
+    
+    return None
 
 def extract_title(xhtml_root):
-    """Extract a LyX document's title from its exported XHTML file and return 
it.
+    """Extract an EPUB title from an XHTML file and return it.
     
     Keyword Arguments:
     xhtml_root -- an ElementTree Element representing the root element of a LyX
         document's exported XHTML file
 
     """
-    title = _DEFAULT_TITLE
-    title_container_xpath_schema = ".//{{{0}}}h1[@class='title']/{{{0}}}a"
-    title_container_xpath = title_container_xpath_schema.format(_XHTML_XMLNS)
-    title_container = xhtml_root.find(title_container_xpath)
-    if title_container is not None:
-        title = title_container.tail
+    title_xhtml_class = "epub-title"
+    title = extract_attribute(xhtml_root, title_xhtml_class)
+    if title is None:
+        title = _DEFAULT_TITLE
+    
     return title
+
+def extract_language(xhtml_root):
+    """Extract an EPUB language from an XHTML file and return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+
+    """
+    language_xhtml_class = "epub-language"
+    language = extract_attribute(xhtml_root, language_xhtml_class)
+    if language is None:
+               language = _DEFAULT_LANGUAGE
+       
+       return language
+
+def extract_identifier(xhtml_root):
+       """Extract an EPUB identifier from an XHTML file and return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+
+    """
+       identifier_xhtml_class = "epub-identifier"
+    identifier = extract_attribute(xhtml_root, identifier_xhtml_class)
+    if identifier is None:
+               identifier = _DEFAULT_IDENTIFIER
+       
+       return identifier
+
+def extract_identifier_scheme(xhtml_root):
+       """Extract an identifier scheme from an XHTML file and return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+
+    """
+       identifier_scheme_xhtml_class = "epub-identifier-scheme"
+    identifier_scheme = extract_attribute(xhtml_root, 
identifier_scheme_xhtml_class)
+    if identifier_scheme is None:
+               identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME
+       
+       return identifier_scheme
+
+def extract_author(xhtml_root):
+       """Extract an EPUB author (in reading order) from an XHTML file and 
return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+
+    """
+       author_xhtml_class = "epub-author-reading-order"
+    author = extract_attribute(xhtml_root, author_xhtml_class)
+    if author is None:
+               author = _DEFAULT_AUTHOR
+       
+       return author
+
+def extract_author_file_as(xhtml_root):
+       """Extract an EPUB author (in file-as order) from an XHTML file and 
return it.
+    
+    Keyword Arguments:
+    xhtml_root -- an ElementTree Element representing the root element of a LyX
+        document's exported XHTML file
+
+    """
+       author_file_as_xhtml_class = "epub-author-file-as-order"
+    author_file_as = extract_attribute(xhtml_root, author_file_as_xhtml_class)
+    if author_file_as is None:
+               author_file_as = _DEFAULT_AUTHOR_FILE_AS
+       
+       return author_file_as

commit 5b58881215de9a0bcafbc23216815003e99f78b0
Author: Josh Hieronymus <[email protected]>
Date:   Wed Sep 11 00:07:39 2013 -0400

    Re-add accidentally removed instances of 'parse' in comments.

diff --git a/lib/scripts/epub/epub_oebps.py b/lib/scripts/epub/epub_oebps.py
index a9296f0..91e5bb5 100644
--- a/lib/scripts/epub/epub_oebps.py
+++ b/lib/scripts/epub/epub_oebps.py
@@ -279,7 +279,7 @@ class Toc(object):
         MAX_PAGE_NUMBER = "0" # only used with navigable pages
         def __init__(self, xhtml_root):
             self.root = ET.Element("head")
-            # should xhtml_root for uid
+            # should parse xhtml_root for uid
             # uid needs to match opf identifier
             uid = "123456789X"
             self.add_meta("dtb:uid", uid)
@@ -307,7 +307,7 @@ class Toc(object):
         def __init__(self, xhtml_root):
             self.root = ET.Element("docAuthor")
             text = ET.Element("text")
-            # should xhtml_root for author
+            # should parse xhtml_root for author
             # author should match opf author
             # author should appear in file-as format
             author = "User, LyX"

commit cbafb4d466bd741b836d7bb59b8de48b7aa6d754
Author: Josh Hieronymus <[email protected]>
Date:   Tue Sep 10 23:46:52 2013 -0400

    Add more paragraph styles to EPUB Metadata module.

diff --git a/lib/layouts/epub-metadata.module b/lib/layouts/epub-metadata.module
index ed0c59b..3b75b22 100644
--- a/lib/layouts/epub-metadata.module
+++ b/lib/layouts/epub-metadata.module
@@ -24,11 +24,46 @@ Style "EPUB Metadata"
        HTMLAttr "class='epub-metadata'"
 End
 
-Style "EPUB Author"
+Style "EPUB Title"
        CopyStyle       "EPUB Metadata"
-       LatexName       EPUB-Author
-       LabelString     "EPUB Author:"
-       HTMLAttr "class='epub-metadata epub-author'"
+       LatexName       EPUB-Title
+       LabelString     "EPUB Title:"
+       HTMLAttr "class='epub-metadata epub-title'"
+End
+
+Style "EPUB Language"
+       CopyStyle       "EPUB Metadata"
+       LatexName       EPUB-Language
+       LabelString     "EPUB Language:"
+       HTMLAttr "class='epub-metadata epub-language'"
+End
+
+Style "EPUB Identifier"
+       CopyStyle       "EPUB Metadata"
+       LatexName       EPUB-Identifier
+       LabelString     "EPUB Identifier:"
+       HTMLAttr "class='epub-metadata epub-identifier'"
+End
+
+Style "EPUB Identifier Scheme"
+       CopyStyle       "EPUB Metadata"
+       LatexName       EPUB-Identifier-Scheme
+       LabelString     "EPUB Identifier Scheme:"
+       HTMLAttr "class='epub-metadata epub-identifier-scheme'"
+End
+
+Style "EPUB Author (reading order)"
+       CopyStyle       "EPUB Metadata"
+       LatexName       EPUB-Author-reading-order
+       LabelString     "EPUB Author (reading order):"
+       HTMLAttr "class='epub-metadata epub-author-reading-order'"
+End
+
+Style "EPUB Author (file-as order)"
+       CopyStyle       "EPUB Metadata"
+       LatexName       EPUB-Author-file-as-order
+       LabelString     "EPUB Author (file-as order):"
+       HTMLAttr "class='epub-metadata epub-author-file-as-order'"
 End
 
 NoStyle "EPUB Metadata"

-----------------------------------------------------------------------

Summary of changes:
 lib/layouts/epub-metadata.module         |   43 ++++++++++-
 lib/scripts/epub/epub_oebps.py           |   38 +++++------
 lib/scripts/epub/epub_xhtml_utilities.py |  114 +++++++++++++++++++++++++++--
 3 files changed, 162 insertions(+), 33 deletions(-)


hooks/post-receive
-- 
Repositories for GSOC work

[LyX GSoC/epub/master] Parse XHTML for EPUB metadata.

Reply via email to