The branch, epub/master, has been updated. - Log -----------------------------------------------------------------
commit e173486ac219a3f8ebdab13cbcd725f09deb0305 Author: Josh Hieronymus <[email protected]> Date: Wed Sep 11 13:43:38 2013 -0400 Parse XHTML for EPUB metadata. The values for the EPUB title, language, identifier, identifier scheme, author name, and author name in file-as order are now extracted from the XHTML file instead of using default values (which are still available when a value cannot be extracted). diff --git a/lib/scripts/epub/epub_oebps.py b/lib/scripts/epub/epub_oebps.py index 91e5bb5..824a3cd 100644 --- a/lib/scripts/epub/epub_oebps.py +++ b/lib/scripts/epub/epub_oebps.py @@ -13,6 +13,11 @@ import os.path import xml.etree.ElementTree as ET import zipfile +from epub_xhtml_utilities import extract_author +from epub_xhtml_utilities import extract_author_file_as +from epub_xhtml_utilities import extract_identifier +from epub_xhtml_utilities import extract_identifier_scheme +from epub_xhtml_utilities import extract_language from epub_xhtml_utilities import extract_title class Oebps(object): @@ -78,7 +83,6 @@ class Oebps(object): self.toc.add_to_epub_file(epub_file) # TODO: (Potentially) split XHTML file into smaller files -# TODO: Parse XHTML file for language, identifier, author, etc. class Content(object): FILENAME = "content.opf" XML_VERSION = "1.0" @@ -130,12 +134,6 @@ class Content(object): IDENTIFIER_TAG = "{{{}}}identifier".format(XMLNS) CREATOR_TAG = "{{{}}}creator".format(XMLNS) # Defaults for information that can't be extracted successfully. - TITLE = "A LyX-Created EPUB Book" - LANGUAGE = "en" - IDENTIFIER = "123456789X" - IDENTIFIER_SCHEME = "ISBN" - CREATOR = "LyX User" - CREATOR_FILE_AS = "User, LyX" CREATOR_ROLE = "aut" def __init__(self, xhtml_root): self.root = ET.Element("{{{}}}metadata".format(Content.PACKAGE_XMLNS)) @@ -143,17 +141,19 @@ class Content(object): self.title.text = extract_title(xhtml_root) self.root.append(self.title) self.language = ET.Element(self.LANGUAGE_TAG) - self.language.text = self.LANGUAGE + self.language.text = extract_language(xhtml_root) self.root.append(self.language) self.identifier = ET.Element(self.IDENTIFIER_TAG) self.identifier.set("id", Content.PACKAGE_UNIQUE_IDENTIFIER) - self.identifier.set("{{{}}}scheme".format(Content.PACKAGE_XMLNS), self.IDENTIFIER_SCHEME) - self.identifier.text = self.IDENTIFIER + identifier_scheme = extract_identifier_scheme(xhtml_root) + self.identifier.set("{{{}}}scheme".format(Content.PACKAGE_XMLNS), identifier_scheme) + self.identifier.text = extract_identifier(xhtml_root) self.root.append(self.identifier) self.creator = ET.Element(self.CREATOR_TAG) - self.creator.set("{{{}}}file-as".format(Content.PACKAGE_XMLNS), self.CREATOR_FILE_AS) + author_file_as = extract_author_file_as(xhtml_root) + self.creator.set("{{{}}}file-as".format(Content.PACKAGE_XMLNS), author_file_as) self.creator.set("{{{}}}role".format(Content.PACKAGE_XMLNS), self.CREATOR_ROLE) - self.creator.text = self.CREATOR + self.creator.text = extract_author(xhtml_root) self.root.append(self.creator) class Manifest(object): @@ -226,20 +226,19 @@ class Content(object): self.root.append(itemref) # TODO: (Potentially) add NavPoints for skipping around in file, bookmarks, etc. -# TODO: Parse XHTML file for title, language, identifier, author, etc. class Toc(object): FILENAME = "toc.ncx" XML_VERSION = "1.0" XML_ENCODING = "UTF-8" NCX_VERSION = "2005-1" - NCX_XML_LANG ="en" NCX_XMLNS = "http://www.daisy.org/z3986/2005/ncx/" def __init__(self, pathname, xhtml_name, xhtml_root): self.pathname = pathname self.root = ET.Element("ncx") self.root.set("version", self.NCX_VERSION) - self.root.set("xml:lang", self.NCX_XML_LANG) + xml_lang = extract_language(xhtml_root) + self.root.set("xml:lang", xml_lang) self.root.set("xmlns", self.NCX_XMLNS) self.head = self.Head(xhtml_root) self.root.append(self.head.root) @@ -272,16 +271,15 @@ class Toc(object): archive_name = os.path.normpath(os.path.join(self.pathname, self.FILENAME)) epub_file.write(self.FILENAME, archive_name, zipfile.ZIP_DEFLATED) - # TODO: extract uid, (possibly) depth from xhtml_root + # TODO: (possibly) extract depth from xhtml_root class Head(object): DEPTH = "1" # depth of the table of contents--some e-readers don't allow > 1 TOTAL_PAGE_COUNT = "0" # only used with navigable pages MAX_PAGE_NUMBER = "0" # only used with navigable pages def __init__(self, xhtml_root): self.root = ET.Element("head") - # should parse xhtml_root for uid # uid needs to match opf identifier - uid = "123456789X" + uid = extract_identifier(xhtml_root) self.add_meta("dtb:uid", uid) self.add_meta("dtb:depth", self.DEPTH) self.add_meta("dtb:totalPageCount", self.TOTAL_PAGE_COUNT) @@ -302,15 +300,13 @@ class Toc(object): text.text = title self.root.append(text) - # TODO: extract author from xhtml_root class DocAuthor(object): def __init__(self, xhtml_root): self.root = ET.Element("docAuthor") text = ET.Element("text") - # should parse xhtml_root for author # author should match opf author # author should appear in file-as format - author = "User, LyX" + author = extract_author_file_as(xhtml_root) text.text = author self.root.append(text) diff --git a/lib/scripts/epub/epub_xhtml_utilities.py b/lib/scripts/epub/epub_xhtml_utilities.py index d46817e..3252e4a 100644 --- a/lib/scripts/epub/epub_xhtml_utilities.py +++ b/lib/scripts/epub/epub_xhtml_utilities.py @@ -66,66 +66,66 @@ def extract_language(xhtml_root): language_xhtml_class = "epub-language" language = extract_attribute(xhtml_root, language_xhtml_class) if language is None: - language = _DEFAULT_LANGUAGE - - return language + language = _DEFAULT_LANGUAGE + + return language def extract_identifier(xhtml_root): - """Extract an EPUB identifier from an XHTML file and return it. + """Extract an EPUB identifier from an XHTML file and return it. Keyword Arguments: xhtml_root -- an ElementTree Element representing the root element of a LyX document's exported XHTML file """ - identifier_xhtml_class = "epub-identifier" + identifier_xhtml_class = "epub-identifier" identifier = extract_attribute(xhtml_root, identifier_xhtml_class) if identifier is None: - identifier = _DEFAULT_IDENTIFIER - - return identifier + identifier = _DEFAULT_IDENTIFIER + + return identifier def extract_identifier_scheme(xhtml_root): - """Extract an identifier scheme from an XHTML file and return it. + """Extract an identifier scheme from an XHTML file and return it. Keyword Arguments: xhtml_root -- an ElementTree Element representing the root element of a LyX document's exported XHTML file """ - identifier_scheme_xhtml_class = "epub-identifier-scheme" + identifier_scheme_xhtml_class = "epub-identifier-scheme" identifier_scheme = extract_attribute(xhtml_root, identifier_scheme_xhtml_class) if identifier_scheme is None: - identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME - - return identifier_scheme + identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME + + return identifier_scheme def extract_author(xhtml_root): - """Extract an EPUB author (in reading order) from an XHTML file and return it. + """Extract an EPUB author (in reading order) from an XHTML file and return it. Keyword Arguments: xhtml_root -- an ElementTree Element representing the root element of a LyX document's exported XHTML file """ - author_xhtml_class = "epub-author-reading-order" + author_xhtml_class = "epub-author-reading-order" author = extract_attribute(xhtml_root, author_xhtml_class) if author is None: - author = _DEFAULT_AUTHOR - - return author + author = _DEFAULT_AUTHOR + + return author def extract_author_file_as(xhtml_root): - """Extract an EPUB author (in file-as order) from an XHTML file and return it. + """Extract an EPUB author (in file-as order) from an XHTML file and return it. Keyword Arguments: xhtml_root -- an ElementTree Element representing the root element of a LyX document's exported XHTML file """ - author_file_as_xhtml_class = "epub-author-file-as-order" + author_file_as_xhtml_class = "epub-author-file-as-order" author_file_as = extract_attribute(xhtml_root, author_file_as_xhtml_class) if author_file_as is None: - author_file_as = _DEFAULT_AUTHOR_FILE_AS - - return author_file_as + author_file_as = _DEFAULT_AUTHOR_FILE_AS + + return author_file_as commit f9ae3cadb6c1664ea47d5ba43843d65b95f0602d Author: Josh Hieronymus <[email protected]> Date: Wed Sep 11 12:41:46 2013 -0400 Implement more functions to extract EPUB metadata from XHTML. diff --git a/lib/scripts/epub/epub_xhtml_utilities.py b/lib/scripts/epub/epub_xhtml_utilities.py index 992b0cc..d46817e 100644 --- a/lib/scripts/epub/epub_xhtml_utilities.py +++ b/lib/scripts/epub/epub_xhtml_utilities.py @@ -13,21 +13,119 @@ import xml.etree.ElementTree as ET # XML namespace used in the exported XHTML file _XHTML_XMLNS = "http://www.w3.org/1999/xhtml" -# a title to be returned when the document's title cannot be extracted +# metadata values to be used when the document's metadata cannot be extracted _DEFAULT_TITLE = "A LyX-Created EPUB Book" +_DEFAULT_LANGUAGE = "en" +_DEFAULT_IDENTIFIER = "123456789X" +_DEFAULT_IDENTIFIER_SCHEME = "ISBN" +_DEFAULT_AUTHOR = "LyX User" +_DEFAULT_AUTHOR_FILE_AS = "User, LyX" + +def extract_attribute(xhtml_root, attribute_xhtml_class): + """Extract an EPUB metadata attribute from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + attribute_xhtml_class -- the value of the class attribute of the XHTML element + containing the attribute + + """ + attribute_container_xpath_schema = ".//{{{0}}}div[@class='epub-metadata {1}']/{{{0}}}a" + attribute_container_xpath = attribute_container_xpath_schema.format(_XHTML_XMLNS, attribute_xhtml_class) + attribute_container = xhtml_root.find(attribute_container_xpath) + if attribute_container is not None: + return attribute_container.tail + + return None def extract_title(xhtml_root): - """Extract a LyX document's title from its exported XHTML file and return it. + """Extract an EPUB title from an XHTML file and return it. Keyword Arguments: xhtml_root -- an ElementTree Element representing the root element of a LyX document's exported XHTML file """ - title = _DEFAULT_TITLE - title_container_xpath_schema = ".//{{{0}}}h1[@class='title']/{{{0}}}a" - title_container_xpath = title_container_xpath_schema.format(_XHTML_XMLNS) - title_container = xhtml_root.find(title_container_xpath) - if title_container is not None: - title = title_container.tail + title_xhtml_class = "epub-title" + title = extract_attribute(xhtml_root, title_xhtml_class) + if title is None: + title = _DEFAULT_TITLE + return title + +def extract_language(xhtml_root): + """Extract an EPUB language from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + """ + language_xhtml_class = "epub-language" + language = extract_attribute(xhtml_root, language_xhtml_class) + if language is None: + language = _DEFAULT_LANGUAGE + + return language + +def extract_identifier(xhtml_root): + """Extract an EPUB identifier from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + """ + identifier_xhtml_class = "epub-identifier" + identifier = extract_attribute(xhtml_root, identifier_xhtml_class) + if identifier is None: + identifier = _DEFAULT_IDENTIFIER + + return identifier + +def extract_identifier_scheme(xhtml_root): + """Extract an identifier scheme from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + """ + identifier_scheme_xhtml_class = "epub-identifier-scheme" + identifier_scheme = extract_attribute(xhtml_root, identifier_scheme_xhtml_class) + if identifier_scheme is None: + identifier_scheme = _DEFAULT_IDENTIFIER_SCHEME + + return identifier_scheme + +def extract_author(xhtml_root): + """Extract an EPUB author (in reading order) from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + """ + author_xhtml_class = "epub-author-reading-order" + author = extract_attribute(xhtml_root, author_xhtml_class) + if author is None: + author = _DEFAULT_AUTHOR + + return author + +def extract_author_file_as(xhtml_root): + """Extract an EPUB author (in file-as order) from an XHTML file and return it. + + Keyword Arguments: + xhtml_root -- an ElementTree Element representing the root element of a LyX + document's exported XHTML file + + """ + author_file_as_xhtml_class = "epub-author-file-as-order" + author_file_as = extract_attribute(xhtml_root, author_file_as_xhtml_class) + if author_file_as is None: + author_file_as = _DEFAULT_AUTHOR_FILE_AS + + return author_file_as commit 5b58881215de9a0bcafbc23216815003e99f78b0 Author: Josh Hieronymus <[email protected]> Date: Wed Sep 11 00:07:39 2013 -0400 Re-add accidentally removed instances of 'parse' in comments. diff --git a/lib/scripts/epub/epub_oebps.py b/lib/scripts/epub/epub_oebps.py index a9296f0..91e5bb5 100644 --- a/lib/scripts/epub/epub_oebps.py +++ b/lib/scripts/epub/epub_oebps.py @@ -279,7 +279,7 @@ class Toc(object): MAX_PAGE_NUMBER = "0" # only used with navigable pages def __init__(self, xhtml_root): self.root = ET.Element("head") - # should xhtml_root for uid + # should parse xhtml_root for uid # uid needs to match opf identifier uid = "123456789X" self.add_meta("dtb:uid", uid) @@ -307,7 +307,7 @@ class Toc(object): def __init__(self, xhtml_root): self.root = ET.Element("docAuthor") text = ET.Element("text") - # should xhtml_root for author + # should parse xhtml_root for author # author should match opf author # author should appear in file-as format author = "User, LyX" commit cbafb4d466bd741b836d7bb59b8de48b7aa6d754 Author: Josh Hieronymus <[email protected]> Date: Tue Sep 10 23:46:52 2013 -0400 Add more paragraph styles to EPUB Metadata module. diff --git a/lib/layouts/epub-metadata.module b/lib/layouts/epub-metadata.module index ed0c59b..3b75b22 100644 --- a/lib/layouts/epub-metadata.module +++ b/lib/layouts/epub-metadata.module @@ -24,11 +24,46 @@ Style "EPUB Metadata" HTMLAttr "class='epub-metadata'" End -Style "EPUB Author" +Style "EPUB Title" CopyStyle "EPUB Metadata" - LatexName EPUB-Author - LabelString "EPUB Author:" - HTMLAttr "class='epub-metadata epub-author'" + LatexName EPUB-Title + LabelString "EPUB Title:" + HTMLAttr "class='epub-metadata epub-title'" +End + +Style "EPUB Language" + CopyStyle "EPUB Metadata" + LatexName EPUB-Language + LabelString "EPUB Language:" + HTMLAttr "class='epub-metadata epub-language'" +End + +Style "EPUB Identifier" + CopyStyle "EPUB Metadata" + LatexName EPUB-Identifier + LabelString "EPUB Identifier:" + HTMLAttr "class='epub-metadata epub-identifier'" +End + +Style "EPUB Identifier Scheme" + CopyStyle "EPUB Metadata" + LatexName EPUB-Identifier-Scheme + LabelString "EPUB Identifier Scheme:" + HTMLAttr "class='epub-metadata epub-identifier-scheme'" +End + +Style "EPUB Author (reading order)" + CopyStyle "EPUB Metadata" + LatexName EPUB-Author-reading-order + LabelString "EPUB Author (reading order):" + HTMLAttr "class='epub-metadata epub-author-reading-order'" +End + +Style "EPUB Author (file-as order)" + CopyStyle "EPUB Metadata" + LatexName EPUB-Author-file-as-order + LabelString "EPUB Author (file-as order):" + HTMLAttr "class='epub-metadata epub-author-file-as-order'" End NoStyle "EPUB Metadata" ----------------------------------------------------------------------- Summary of changes: lib/layouts/epub-metadata.module | 43 ++++++++++- lib/scripts/epub/epub_oebps.py | 38 +++++------ lib/scripts/epub/epub_xhtml_utilities.py | 114 +++++++++++++++++++++++++++-- 3 files changed, 162 insertions(+), 33 deletions(-) hooks/post-receive -- Repositories for GSOC work
