Fridrich Strba wrote:
Then here's the proposed ugly code. So far, not much to look at. This code was just an experiment. Even the most ugly code that has some desired functionality is worth showing. IMHO, technical discussions around an existing, though imperfect, code are really useful for one's growth ;-) And I know what I am saying when I speak about imperfect code from my own hacking experience :-)
Andrew
/** * Copyright (C) 2006 Andrew Ziem. * Copyright (C) 2004, 2005 William Lachance ([EMAIL PROTECTED]) * (some parts copied from libwpd 0.8.6) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA * * */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <string> #include <inttypes.h> #include <errno.h> #include <iconv.h> #include <gsf/gsf-utils.h> #include <gsf/gsf-input-stdio.h> #include <gsf/gsf-infile.h> #include <gsf/gsf-infile-msole.h> //fixme #define WPD_LE_GET_GUINT8(p) (*(uint8_t const *)(p)) #define WPD_LE_GET_GUINT16(p) \ (uint16_t)((((uint8_t const *)(p))[0] << 0) | \ (((uint8_t const *)(p))[1] << 8)) #define WPD_LE_GET_GUINT32(p) \ (uint32_t)((((uint8_t const *)(p))[0] << 0) | \ (((uint8_t const *)(p))[1] << 8) | \ (((uint8_t const *)(p))[2] << 16) | \ (((uint8_t const *)(p))[3] << 24)) #define WPD_BE_GET_GUINT8(p) (*(uint8_t const *)(p)) #define WPD_BE_GET_GUINT16(p) \ (uint16_t)((((uint8_t const *)(p))[1] << 0) | \ (((uint8_t const *)(p))[0] << 8)) #define WPD_BE_GET_GUINT32(p) \ (uint32_t)((((uint8_t const *)(p))[3] << 0) | \ (((uint8_t const *)(p))[2] << 8) | \ (((uint8_t const *)(p))[1] << 16) | \ (((uint8_t const *)(p))[0] << 24)) #define DELETEP(m) if (m) { delete m; m = NULL; } #define WPS_VERSION_UNKNOWN 0 #define WPS_VERSION_4 1 #define WPS_VERSION_2000 2 #define 
WPS_VERSION_8 3 #define WPS8_HEADER_MAGIC_OFFSET 0 class FileException { // needless to say, we could flesh this class out a bit }; class GSFInputStream { public: GSFInputStream(GsfInput *input); virtual ~GSFInputStream(); const uint8_t *read(size_t numBytes, size_t &numBytesRead); int seek(long offset, GSeekType seekType); const long tell(); bool atEOS(); std::string readCPPString(size_t numBytes); char * readCString(size_t numBytes); bool hasChildByName(char const *name); int getWorksVersion(); GSFInputStream * getDocumentOLEStream(); int8_t read8(); uint8_t readU8(); uint16_t readU16(bool bigendian=false); uint32_t readU32(bool bigendian=false); private: GsfInput *m_input; GsfInfile *m_ole; int wversion; }; GSFInputStream::GSFInputStream(GsfInput *input) { m_input = input; m_ole = NULL; wversion = WPS_VERSION_UNKNOWN; g_object_ref(G_OBJECT(input)); } GSFInputStream::~GSFInputStream() { if (m_ole) g_object_unref(G_OBJECT(m_ole)); g_object_unref(G_OBJECT(m_input)); } const uint8_t * GSFInputStream::read(size_t numBytes, size_t &numBytesRead) { const uint8_t *buf = gsf_input_read(m_input, numBytes, NULL); if (buf == NULL) numBytesRead = 0; else numBytesRead = numBytes; return buf; } int GSFInputStream::seek(long offset, GSeekType seekType) { return gsf_input_seek(m_input, offset, seekType); } const long GSFInputStream::tell() { return gsf_input_tell(m_input); } bool GSFInputStream::atEOS() { return gsf_input_eof(m_input); } bool GSFInputStream::hasChildByName(char const *name) { if (!m_ole) m_ole = GSF_INFILE(gsf_infile_msole_new (m_input, NULL)); if (m_ole) { GsfInput *g = gsf_infile_child_by_name (m_ole, name); if (g) { g_object_unref(G_OBJECT (g)); return true; } } return false; } void gsf_dump_children_by_name(GsfInfile *infile) { if (!infile) return; int num_children = gsf_infile_num_children(infile); for (int i = 0; i < num_children; i++) { GsfInput *g = gsf_infile_child_by_index(infile, i); if (g) { printf("child: %s\n", gsf_input_name(g)); 
g_object_unref(G_OBJECT (g)); } else { printf("error getting child %i\n", i); } } } int GSFInputStream::getWorksVersion() { //fixme: a lot of guessing right now int hasCompObj, hasMM, hasMNO, hasMatOST, hasCONTENTS, hasSPELLING; bool hasWorks8Magic = false, hasWorks2000Magic = false; const char CompObj[] = {0x01,'C','o','m','p','O','b','j',0}; hasCompObj = hasChildByName(CompObj); hasMM = hasChildByName("MM"); hasMNO = hasChildByName("MN0"); hasMatOST = hasChildByName("MatOST"); hasCONTENTS = hasChildByName("CONTENTS"); hasSPELLING = hasChildByName("SPELLING"); if (m_ole) { if (hasCONTENTS) { char fileMagic[8]; GSFInputStream *documentStream = NULL; GsfInput * document = gsf_infile_child_by_name(m_ole, "CONTENTS"); if (document) { documentStream = new GSFInputStream(document); g_object_unref(G_OBJECT (document)); /* check the Works 2000, 8 format magics */ documentStream->seek(WPS8_HEADER_MAGIC_OFFSET, G_SEEK_SET); for (int i=0; i<7 && !documentStream->atEOS(); i++) fileMagic[i] = documentStream->readU8(); fileMagic[7] = '\0'; // works8 have CHNKWKS -- maybe works7 hasWorks8Magic = (0 == strcmp(fileMagic, "CHNKWKS")); hasWorks2000Magic = (0 == strcmp(fileMagic, "CHNKINK")); if (!hasWorks8Magic && !hasWorks2000Magic) { //todo: debug message } DELETEP(documentStream); } } int num_children = gsf_infile_num_children(m_ole); // some old Works has same as v4 and {0x05} "SummaryInformation" and {0x05} DocumentSummaryInformation // old Works, maybe v4, has CompObj, MM, MNO // Works v8, maybe also 7, has CompObj, CONTENTS, SPELLING //todo: examine CompObj for version info // comparing first 68 bytes of many CompObj // cat ?CompObj | tr -d "\n" | cut -b 1-68 | md5sum -b // most have a55af8a258d4a125569a58b126c38e9f if (num_children > 3) { printf("num_children=%i\n", num_children); gsf_dump_children_by_name(m_ole); } if (!hasWorks2000Magic && !hasWorks8Magic && hasMM && hasMNO && hasMatOST && !hasCONTENTS && !hasSPELLING) { wversion = WPS_VERSION_4; return wversion; } if 
(hasWorks2000Magic && !hasWorks8Magic && !hasMM && !hasMNO && !hasMatOST && hasCONTENTS) { wversion = WPS_VERSION_2000; return wversion; } if (!hasWorks2000Magic && hasWorks8Magic && hasCompObj && !hasMM && !hasMNO && !hasMatOST && hasCONTENTS && hasSPELLING && 3==num_children) { wversion = WPS_VERSION_8; return wversion; } if (num_children <= 3) { gsf_dump_children_by_name(m_ole); } } wversion = WPS_VERSION_UNKNOWN; return wversion; } GSFInputStream * GSFInputStream::getDocumentOLEStream() { GSFInputStream *documentStream = NULL; if (!m_ole) m_ole = GSF_INFILE(gsf_infile_msole_new (m_input, NULL)); if (m_ole) { GsfInput * document = NULL; switch (wversion) { case WPS_VERSION_4: document = gsf_infile_child_by_name(m_ole, "MN0"); break; case WPS_VERSION_2000: case WPS_VERSION_8: document = gsf_infile_child_by_name(m_ole, "CONTENTS"); break; } if (document) { documentStream = new GSFInputStream(document); g_object_unref(G_OBJECT (document)); } } return documentStream; } uint8_t GSFInputStream::readU8() { size_t numBytesRead; uint8_t const * p = read(sizeof(uint8_t), numBytesRead); if (!p || numBytesRead != sizeof(uint8_t)) throw FileException(); return WPD_LE_GET_GUINT8(p); } int8_t GSFInputStream::read8() { size_t numBytesRead; int8_t const * p = (int8_t const *) read(sizeof(int8_t), numBytesRead); if (!p || numBytesRead != sizeof(int8_t)) throw FileException(); return (int8_t)*(p); } uint16_t GSFInputStream::readU16(bool bigendian) { size_t numBytesRead; uint16_t const *val = (uint16_t const *) read(sizeof(uint16_t), numBytesRead); if (!val || numBytesRead != sizeof(uint16_t)) throw FileException(); if (bigendian) return WPD_BE_GET_GUINT16(val); return WPD_LE_GET_GUINT16(val); } uint32_t GSFInputStream::readU32(bool bigendian) { size_t numBytesRead; uint32_t const *val = (uint32_t const *) read(sizeof(uint32_t), numBytesRead); if (!val || numBytesRead != sizeof(uint32_t)) throw FileException(); if (bigendian) return WPD_BE_GET_GUINT32(val); return 
WPD_LE_GET_GUINT32(val); } std::string GSFInputStream::readCPPString(size_t numBytes) { std::string s; char c; for (size_t i = 0; i < numBytes; i++) { c = (char)readU8(); s += c; } return s; } char * GSFInputStream::readCString(size_t numBytes) { char * s = (char *)malloc(numBytes + 2); if (NULL == s) { perror("malloc"); return NULL; } for (size_t i = 0; i < numBytes; i++) { s[i] = (char)readU8(); } return s; } static void dump_wps8(GSFInputStream *document) { // version number not at 21 document->seek(21, G_SEEK_SET); int v = document->readU8(); printf("vers ? = %xh (%i)\n", v, v); // find first TEXT document->seek(34 - document->tell(), G_SEEK_CUR); std::string header = document->readCPPString(4); printf("header = %s\n", header.c_str()); // + 6 bytes data document->seek(6, G_SEEK_CUR); // find next TEXT header = document->readCPPString(4); printf("header = %s\n", header.c_str()); document->seek(4, G_SEEK_CUR); size_t text_length = document->readU32(); text_length -= 2; if (text_length < 2) { printf("no text!\n"); return; } printf("text length = %i (%xh)\n", text_length, text_length); // read the text contents document->seek(0x200, G_SEEK_SET); char * text = document->readCString(text_length); text[text_length-2]=text[text_length-1]=0; // printf("text = %02x,%02x,%02x,%02x\n", text[0], text[1], text[2],text[3]); // printf("text = %02x,%02x,%02x,%02x\n", text[4], text[5], text[6],text[7]); iconv_t cd; // conversion descriptor cd = iconv_open("UTF-8", "UTF-16LE"); //guessing if ((iconv_t)-1 == cd) { g_error("iconv_open() failed\n"); return; } size_t outbytesleft =(text_length*2); //fixme: size printf("outbytesleft starts = %i\n", outbytesleft); char *outbuffer = (char *)malloc(outbytesleft+1); if (NULL == outbuffer) { perror("malloc"); return; } char *source = text; char *result = outbuffer; size_t rc = iconv(cd, &text, &text_length, &outbuffer, &outbytesleft); if ((size_t)-1 == rc) { g_error("iconv() failed, errno=%i\n", errno); return; } iconv_close(cd); // change 
end of line character int x = strlen(result); for (;x>=0;x--) { if (0x0D==result[x]) result[x]=0x0A; } printf("result = %s\n", result); // printf("result = %02x,%02x,%02x,%02x\n", result[0], result[1], result[2],result[3]); free(source); free(result); } static void dump_wps4(GSFInputStream *document) { // get text length // works4: offset for text_length is at 0x26 // checked lengths 1, 2, FF, FFFF document->seek(0x26, G_SEEK_SET); size_t text_length = document->readU32(); text_length -= (256+4); printf("text_length = %i\n", text_length); // read actual text // works4: offset for text start is 0x102 document->seek(0x102, G_SEEK_SET); char * text = document->readCString(text_length); if (text) { printf("text = %s\n", text); free(text); } } static void dump_wps (GSFInputStream *istream) { int wversion = istream->getWorksVersion(); if (WPS_VERSION_UNKNOWN == wversion) { printf("Unknown Works version\n"); return; } GSFInputStream *document = istream->getDocumentOLEStream(); if (!document) { g_error ("Input stream failed"); return; } switch (wversion) { case WPS_VERSION_4: printf("Works version 4 format\n"); dump_wps4(document); break; case WPS_VERSION_2000: printf("Works version 2000 (v5) format\n"); break; case WPS_VERSION_8: printf("Works version 8 (Suite 2005) format\n"); dump_wps8(document); break; } DELETEP(document); } int main(int argc, char **argv) { GsfInput *input; GError *err = NULL; char *fn; if (argc < 2) { g_error("put filename on command line"); } fn = argv[1]; gsf_init (); printf("\ndebug: opening %s\n", fn); input = gsf_input_stdio_new(fn, &err); if (NULL == input) { g_return_val_if_fail (err != NULL, 1); g_warning ("'%s' error: %s", fn, err->message); g_error_free (err); return 1; } GSFInputStream istream(input); dump_wps(&istream); // infile = gsf_infile_msole_new (input, &err); // g_object_unref (G_OBJECT (input)); gsf_shutdown (); return 0; }
# Build the wps_test experiment.
# Compiler and linker flags for libgsf (and the glib it depends on) come from
# pkg-config rather than hard-coded include and library paths.
.PHONY: all
all: wps_test

wps_test: wps_test.cpp
	g++ -g -Wall -o wps_test wps_test.cpp `pkg-config --cflags --libs libgsf-1`
------------------------------------------------------------------------- Using Tomcat but need to do more? Need to support web services, security? Get stuff done quickly with pre-integrated technology to make your job easier Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________ Libwpd-devel mailing list Libwpd-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/libwpd-devel