Fridrich Strba wrote:
Even the ugliest code that has some desired functionality is worth
showing. IMHO, technical discussions around existing, though
imperfect, code are really useful for one's growth ;-) And I know what I
am saying when I speak about imperfect code, from my own hacking
experience :-)
Then here's the proposed ugly code. So far, not much to look at. This code was just an experiment.


Andrew
/**
 * Copyright (C) 2006 Andrew Ziem.
 * Copyright (C) 2004, 2005 William Lachance ([EMAIL PROTECTED])
 *       (some parts copied from libwpd 0.8.6)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
 *
 *
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <inttypes.h>
#include <errno.h>
#include <iconv.h>

#include <gsf/gsf-utils.h>

#include <gsf/gsf-input-stdio.h>
#include <gsf/gsf-infile.h>
#include <gsf/gsf-infile-msole.h>




// Byte-order helpers: assemble an unsigned integer from a raw byte buffer,
// independent of host endianness and of the buffer's alignment.  `p` may be
// any pointer type; it is reinterpreted as a pointer to bytes.
//
// Every byte is widened to an unsigned type *before* it is shifted.  A plain
// uint8_t is promoted to (signed) int, and shifting a value >= 0x80 left by
// 24 bits overflows that int.  Each expansion is fully parenthesised so the
// macros can be used inside larger expressions.
#define WPD_LE_GET_GUINT8(p) (*(uint8_t const *)(p))
#define WPD_LE_GET_GUINT16(p)                                           \
        ((uint16_t)(((unsigned)((uint8_t const *)(p))[0] << 0) |        \
                    ((unsigned)((uint8_t const *)(p))[1] << 8)))
#define WPD_LE_GET_GUINT32(p)                                           \
        ((uint32_t)(((uint32_t)((uint8_t const *)(p))[0] << 0)  |       \
                    ((uint32_t)((uint8_t const *)(p))[1] << 8)  |       \
                    ((uint32_t)((uint8_t const *)(p))[2] << 16) |       \
                    ((uint32_t)((uint8_t const *)(p))[3] << 24)))

#define WPD_BE_GET_GUINT8(p) (*(uint8_t const *)(p))
#define WPD_BE_GET_GUINT16(p)                                           \
        ((uint16_t)(((unsigned)((uint8_t const *)(p))[1] << 0) |        \
                    ((unsigned)((uint8_t const *)(p))[0] << 8)))
#define WPD_BE_GET_GUINT32(p)                                           \
        ((uint32_t)(((uint32_t)((uint8_t const *)(p))[3] << 0)  |       \
                    ((uint32_t)((uint8_t const *)(p))[2] << 8)  |       \
                    ((uint32_t)((uint8_t const *)(p))[1] << 16) |       \
                    ((uint32_t)((uint8_t const *)(p))[0] << 24)))

// Delete the object `m` points to (if any) and reset `m` to NULL.
// Wrapped in do { } while (0) so that `DELETEP(x);` is a single statement
// and stays correct inside an unbraced if/else.
#define DELETEP(m) do { if (m) { delete m; m = NULL; } } while (0)

// Works file-format versions as reported by GSFInputStream::getWorksVersion().
#define WPS_VERSION_UNKNOWN 0
#define WPS_VERSION_4 1
#define WPS_VERSION_2000 2
#define WPS_VERSION_8 3

// Offset of the magic string inside the "CONTENTS" stream.
#define WPS8_HEADER_MAGIC_OFFSET 0

// Exception type thrown by the GSFInputStream integer readers when the
// requested bytes cannot be read.  It carries no state yet and could be
// fleshed out (e.g. with a message) later.
class FileException
{
};


// C++ wrapper around a libgsf GsfInput.  Adds typed integer readers and
// helpers for probing a Microsoft OLE container for a Works document.
//
// The wrapper holds its own reference on the GsfInput (taken in the
// constructor, released in the destructor) and lazily opens the OLE view
// of that input the first time a child stream is looked up.
class GSFInputStream
{
public:
	// Wraps `input` and takes an additional reference on it.
	GSFInputStream(GsfInput *input);
	// Releases the OLE view (if it was opened) and the reference on the input.
	virtual ~GSFInputStream();
	
	// Reads `numBytes` bytes.  Returns the buffer handed back by libgsf, or
	// NULL on failure; `numBytesRead` is set to `numBytes` or 0 accordingly.
	const uint8_t *read(size_t numBytes, size_t &numBytesRead);
	// Forwards to gsf_input_seek() and returns its result unchanged.
	int seek(long offset, GSeekType seekType);
	// Current position as reported by gsf_input_tell().
	const long tell();
	// True when libgsf reports end of stream.
	bool atEOS();
	// Reads exactly `numBytes` bytes into a std::string; throws FileException
	// if a byte cannot be read.
	std::string readCPPString(size_t numBytes);
	// Reads `numBytes` bytes into a malloc()ed buffer the caller must free();
	// returns NULL if the allocation fails.
	char * readCString(size_t numBytes);	
	// True if the OLE container has a child stream called `name`.
	bool hasChildByName(char const *name);
	// Guesses the Works version (one of the WPS_VERSION_* constants) from the
	// container's child streams and magic string, and remembers the result.
	int getWorksVersion();
	// Returns a new stream (owned by the caller) for the main document stream
	// of the version found by getWorksVersion(), or NULL if unavailable.
	GSFInputStream * getDocumentOLEStream();
	
	// Single-value readers; each throws FileException when the read fails.
	int8_t read8(); 
	uint8_t readU8(); 
	uint16_t readU16(bool bigendian=false);
	uint32_t readU32(bool bigendian=false);
	
private:
	// Not copyable: the destructor unrefs m_input and m_ole, so a
	// compiler-generated copy would release each reference twice.
	// Declared but intentionally left undefined.
	GSFInputStream(const GSFInputStream &);
	GSFInputStream &operator=(const GSFInputStream &);

	GsfInput *m_input;	// underlying input; one reference is held
	GsfInfile *m_ole;	// OLE view of m_input, NULL until first needed
	int wversion;		// last result of getWorksVersion()
};

// Stores `input`, starts with no OLE view and an unknown Works version,
// and takes an extra reference on `input` that the destructor releases.
GSFInputStream::GSFInputStream(GsfInput *input)
	: m_input(input),
	  m_ole(NULL),
	  wversion(WPS_VERSION_UNKNOWN)
{
	g_object_ref(G_OBJECT(input));
}

// Drops the OLE view, if one was ever opened, and then the reference on
// the wrapped input.
GSFInputStream::~GSFInputStream()
{
	if (NULL != m_ole)
	{
		g_object_unref(G_OBJECT(m_ole));
	}

	g_object_unref(G_OBJECT(m_input));
}

// Asks libgsf for `numBytes` bytes from the current position.
// Returns the buffer libgsf hands back, or NULL on failure.
// `numBytesRead` receives `numBytes` on success and 0 on failure.
const uint8_t * GSFInputStream::read(size_t numBytes, size_t &numBytesRead)
{
	const uint8_t *data = gsf_input_read(m_input, numBytes, NULL);

	numBytesRead = (NULL != data) ? numBytes : 0;

	return data;
}

// Moves the read position of the wrapped input; the status returned by
// gsf_input_seek() is passed through unchanged.
int GSFInputStream::seek(long offset, GSeekType seekType) 
{
	const int status = gsf_input_seek(m_input, offset, seekType);
	return status;
}

// Reports the current read position of the wrapped input, converted to long.
const long GSFInputStream::tell()
{
	const long position = gsf_input_tell(m_input);
	return position;
}

// True once libgsf reports that the wrapped input is at end of stream.
bool GSFInputStream::atEOS()
{
	const bool exhausted = gsf_input_eof(m_input);
	return exhausted;
}


bool GSFInputStream::hasChildByName(char const *name)
{
	if (!m_ole)
		m_ole = GSF_INFILE(gsf_infile_msole_new (m_input, NULL)); 

	if (m_ole)
	{
		GsfInput *g = gsf_infile_child_by_name (m_ole, name);
	
		if (g)
		{
			g_object_unref(G_OBJECT (g));	
			return true;
		}
	}
	return false;
}

// Debug helper: prints the name of every child of `infile` to stdout,
// one per line.  Does nothing when `infile` is NULL.
void gsf_dump_children_by_name(GsfInfile *infile)
{
	if (NULL == infile)
		return;

	const int childCount = gsf_infile_num_children(infile);

	for (int index = 0; index < childCount; ++index)
	{
		GsfInput *child = gsf_infile_child_by_index(infile, index);
		if (NULL == child)
		{
			printf("error getting child %i\n", index);
			continue;
		}

		printf("child: %s\n", gsf_input_name(child));
		g_object_unref(G_OBJECT (child));
	}
}

// Guesses which Works version produced the file and returns the matching
// WPS_VERSION_* constant (WPS_VERSION_UNKNOWN if nothing matches).  The
// result is also stored in `wversion` for getDocumentOLEStream().
//
// The guess combines two things: which child streams the OLE container has,
// and the 7-byte magic string at the start of the "CONTENTS" stream.
// May throw FileException if reading the magic string fails.
int GSFInputStream::getWorksVersion()
{
	//fixme: a lot of guessing right now
	bool hasWorks8Magic = false, hasWorks2000Magic = false;

	// Stream name starts with a 0x01 byte, so it cannot be a plain literal.
	const char CompObj[] = {0x01,'C','o','m','p','O','b','j',0};
	
	// The first lookup also opens the OLE view (m_ole) as a side effect.
	const bool hasCompObj = hasChildByName(CompObj);
	const bool hasMM = hasChildByName("MM");	
	const bool hasMNO = hasChildByName("MN0");
	const bool hasMatOST = hasChildByName("MatOST");
	const bool hasCONTENTS = hasChildByName("CONTENTS");
	const bool hasSPELLING = hasChildByName("SPELLING");
		
	if (m_ole)
	{	
		if (hasCONTENTS)
		{
			GsfInput * document = gsf_infile_child_by_name(m_ole, "CONTENTS");
		
			if (document) 
			{
				// Stack-allocated so the stream is released even if
				// readU8() throws; it holds its own reference on `document`.
				GSFInputStream documentStream(document);
				g_object_unref(G_OBJECT (document));			
			
				// Zero-filled: a stream shorter than 7 bytes leaves the
				// tail untouched, and strcmp() must not read garbage.
				char fileMagic[8] = {0};

				/* check the Works 2000, 8 format magics */
				documentStream.seek(WPS8_HEADER_MAGIC_OFFSET, G_SEEK_SET);
				for (int i=0; i<7 && !documentStream.atEOS(); i++)
					fileMagic[i] = documentStream.readU8();
				fileMagic[7] = '\0';
	
				// works8 have CHNKWKS -- maybe works7
				hasWorks8Magic = (0 == strcmp(fileMagic, "CHNKWKS"));
				hasWorks2000Magic = (0 == strcmp(fileMagic, "CHNKINK"));
				if (!hasWorks8Magic && !hasWorks2000Magic)
				{
					//todo: debug message
				}
			}
		}


		int num_children = gsf_infile_num_children(m_ole);		

		// some old Works has same as v4 and {0x05} "SummaryInformation" and {0x05} DocumentSummaryInformation
		// old Works, maybe v4, has CompObj, MM, MNO 
		// Works v8, maybe also 7, has CompObj, CONTENTS, SPELLING
		
		//todo: examine CompObj for version info
		//     comparing first 68 bytes of many CompObj
		//          cat ?CompObj | tr -d "\n" | cut -b 1-68 | md5sum -b
		//             most have a55af8a258d4a125569a58b126c38e9f
		
		// Debug aid: list the children of unusually large containers.
		if (num_children > 3)
		{
			printf("num_children=%i\n", num_children);
			gsf_dump_children_by_name(m_ole);				
		}
	
		if (!hasWorks2000Magic && !hasWorks8Magic && hasMM && hasMNO && hasMatOST && !hasCONTENTS && !hasSPELLING)
		{
			wversion = WPS_VERSION_4;
			return wversion;
		}
		
		if (hasWorks2000Magic && !hasWorks8Magic && !hasMM && !hasMNO && !hasMatOST && hasCONTENTS)
		{
			wversion = WPS_VERSION_2000;
			return wversion;
		}		
		
		if (!hasWorks2000Magic && hasWorks8Magic && hasCompObj && !hasMM && !hasMNO && !hasMatOST && hasCONTENTS && hasSPELLING && 3==num_children)
		{
			wversion = WPS_VERSION_8;
			return wversion;
		}
		
		// Nothing matched: list the children of small containers too
		// (large ones were already listed above).
		if (num_children <= 3)
		{
			gsf_dump_children_by_name(m_ole);				
		}
	}
		
	wversion = WPS_VERSION_UNKNOWN;
	return wversion;
	
}


// Opens the child stream that holds the document body for the version
// previously stored in `wversion`: "MN0" for Works 4, "CONTENTS" for
// Works 2000 and Works 8.
// Returns a newly allocated stream the caller must delete, or NULL when the
// OLE view cannot be opened, the version is not one of those three, or the
// child stream is missing.
GSFInputStream * GSFInputStream::getDocumentOLEStream()
{
	if (NULL == m_ole)
		m_ole = GSF_INFILE(gsf_infile_msole_new (m_input, NULL)); 

	if (NULL == m_ole)
		return NULL;

	const char *streamName = NULL;
	if (WPS_VERSION_4 == wversion)
		streamName = "MN0";
	else if (WPS_VERSION_2000 == wversion || WPS_VERSION_8 == wversion)
		streamName = "CONTENTS";

	if (NULL == streamName)
		return NULL;

	GsfInput *child = gsf_infile_child_by_name(m_ole, streamName);
	if (NULL == child)
		return NULL;

	// The wrapper takes its own reference, so ours can be dropped.
	GSFInputStream *wrapped = new GSFInputStream(child);
	g_object_unref(G_OBJECT (child));

	return wrapped;
}



// Reads one unsigned byte; throws FileException if the byte cannot be read.
uint8_t GSFInputStream::readU8()
{
	size_t got;
	uint8_t const * byte = read(sizeof(uint8_t), got);

	if (NULL == byte || sizeof(uint8_t) != got)
	{
		throw FileException();
	}

	return WPD_LE_GET_GUINT8(byte);
}

// Reads one signed byte; throws FileException if the byte cannot be read.
int8_t GSFInputStream::read8()
{
	size_t got;
	int8_t const * byte = (int8_t const *) read(sizeof(int8_t), got);

	if (NULL == byte || sizeof(int8_t) != got)
	{
		throw FileException();
	}

	return (int8_t)*byte;
}

// Reads a 16-bit unsigned integer, little-endian unless `bigendian` is set.
// Throws FileException if the two bytes cannot be read.
uint16_t GSFInputStream::readU16(bool bigendian)
{
	size_t got;
	uint8_t const *bytes = read(sizeof(uint16_t), got);

	if (NULL == bytes || sizeof(uint16_t) != got)
	{
		throw FileException();
	}

	// The macros assemble the value byte by byte.
	return bigendian ? WPD_BE_GET_GUINT16(bytes) : WPD_LE_GET_GUINT16(bytes);
}

// Reads a 32-bit unsigned integer, little-endian unless `bigendian` is set.
// Throws FileException if the four bytes cannot be read.
uint32_t GSFInputStream::readU32(bool bigendian)
{
	size_t got;
	uint8_t const *bytes = read(sizeof(uint32_t), got);

	if (NULL == bytes || sizeof(uint32_t) != got)
	{
		throw FileException();
	}

	// The macros assemble the value byte by byte.
	return bigendian ? WPD_BE_GET_GUINT32(bytes) : WPD_LE_GET_GUINT32(bytes);
}

// Reads `numBytes` bytes one at a time and returns them as a std::string.
// readU8() throws FileException if a byte cannot be read.
std::string GSFInputStream::readCPPString(size_t numBytes)
{
	std::string collected;

	for (size_t remaining = numBytes; remaining != 0; --remaining)
	{
		collected.push_back((char)readU8());
	}

	return collected;
}


// Reads `numBytes` bytes into a freshly malloc()ed buffer that the caller
// must free().
//
// The buffer is `numBytes + 2` bytes long and the two spare bytes are set to
// zero, so the result is terminated both as a byte string and as a string
// of 16-bit code units.
//
// Returns NULL (after reporting via perror) if the buffer cannot be
// allocated.  Throws FileException if a byte cannot be read; the buffer is
// released before the exception propagates.
char * GSFInputStream::readCString(size_t numBytes)
{
	// Guard the "+ 2" below against wrapping around to a tiny allocation.
	if (numBytes > (size_t)-1 - 2)
	{
		errno = ENOMEM;
		perror("malloc");
		return NULL;
	}

	char * s = (char *)malloc(numBytes + 2);
	if (NULL == s)
	{
		perror("malloc");
		return NULL;
	}

	try
	{
		for (size_t i = 0; i < numBytes; i++)
		{
			s[i] = (char)readU8();
		}
	}
	catch (...)
	{
		// Do not leak the buffer when the stream ends early.
		free(s);
		throw;
	}

	s[numBytes] = '\0';
	s[numBytes + 1] = '\0';
	
	return s;
}



// Dumps the text of a Works 8 "CONTENTS" stream to stdout.
//
// Prints some header fields for debugging, reads the text found at offset
// 0x200, converts it from UTF-16LE (a guess) to UTF-8 with iconv, turns
// carriage returns into line feeds and prints the result.
//
// Conversion problems are reported with g_warning() rather than g_error():
// g_error() aborts the process, which made the cleanup after it unreachable.
// May throw FileException if the stream ends prematurely.
static void dump_wps8(GSFInputStream *document)
{

	// version number not at 21
	document->seek(21, G_SEEK_SET);
	int v = document->readU8();
	printf("vers ? = %xh (%i)\n", v, v);

	// find first TEXT 
	document->seek(34 - document->tell(), G_SEEK_CUR);	
	std::string header = document->readCPPString(4);
	printf("header = %s\n", header.c_str());
	// + 6 bytes data
	document->seek(6, G_SEEK_CUR);	
	// find next TEXT
	header = document->readCPPString(4);
	printf("header = %s\n", header.c_str());
	document->seek(4, G_SEEK_CUR);	

	// The usable length is the stored value minus 2, and fewer than 2 usable
	// bytes means "no text".  Test before subtracting: the length is
	// unsigned, so "stored - 2" would wrap for stored values of 0 or 1.
	const uint32_t stored_length = document->readU32();
	if (stored_length < 4)
	{
		printf("no text!\n");
		return;
	}
	const size_t text_length = stored_length - 2;

	printf("text length = %lu (%lxh)\n",
	       (unsigned long)text_length, (unsigned long)text_length);
	
	// read the text contents
	document->seek(0x200, G_SEEK_SET);	
	char * text = document->readCString(text_length);
	if (NULL == text)
	{
		// readCString() has already reported the allocation failure.
		return;
	}

	// Zero the last 16-bit unit so that the converted output ends in NUL.
	text[text_length-2]=text[text_length-1]=0;
	
	iconv_t cd; // conversion descriptor
	cd = iconv_open("UTF-8", "UTF-16LE"); //guessing
	if ((iconv_t)-1 == cd)
	{
		g_warning("iconv_open() failed\n");
		free(text);
		return;
	}

	size_t outbytesleft =(text_length*2);	 //fixme: size
	printf("outbytesleft starts = %lu\n", (unsigned long)outbytesleft);
	char *result = (char *)malloc(outbytesleft+1);
	if (NULL == result)
	{
		perror("malloc");
		iconv_close(cd);
		free(text);
		return;
	}

	// iconv() advances the pointers and decrements the counters it is given,
	// so work on copies and keep `text` / `result` for free().
	char *inptr = text;
	char *outptr = result;
	size_t inbytesleft = text_length;
	const size_t rc = iconv(cd, &inptr, &inbytesleft, &outptr, &outbytesleft);
	const int iconv_errno = errno;	// iconv_close() may overwrite errno
	iconv_close(cd);
	if ((size_t)-1 == rc)
	{
		g_warning("iconv() failed, errno=%i\n", iconv_errno);
		free(text);
		free(result);
		return;
	}

	// The buffer has one spare byte; terminate explicitly instead of relying
	// on the converted NUL unit alone.
	*outptr = '\0';

	// change end of line character
	for (char *c = result; *c != '\0'; ++c)
	{
		if (0x0D == *c)
			*c = 0x0A;
	}
	printf("result = %s\n", result);

	free(text);	
	free(result);
}

// Dumps the text of a Works 4 "MN0" stream to stdout.
//
// A 32-bit value at offset 0x26 gives the text length plus 256 + 4; the text
// itself starts at offset 0x102 and is printed as-is.
// May throw FileException if the stream ends prematurely.
static void dump_wps4(GSFInputStream *document)
{
	// get text length
	// works4: offset for text_length is at 0x26
	// checked lengths 1, 2, FF, FFFF
	document->seek(0x26, G_SEEK_SET);	
	const uint32_t stored_length = document->readU32();

	// Test before subtracting: the length is unsigned, so a stored value
	// below 260 would wrap around to a huge number.
	if (stored_length < (256+4))
	{
		printf("no text!\n");
		return;
	}
	const size_t text_length = stored_length - (256+4);
	printf("text_length = %lu\n", (unsigned long)text_length);

	// read actual text
	// works4: offset for text start is 0x102	
	document->seek(0x102, G_SEEK_SET);	
	char * text = document->readCString(text_length);
	if (text)
	{
		// readCString() allocates numBytes + 2 bytes, so this index is in
		// bounds; "%s" needs a terminated string.
		text[text_length] = '\0';
		printf("text = %s\n", text);
		free(text);
	}
}

// Detects the Works version of `istream`, opens the matching document
// stream and hands it to the version-specific dump routine.
//
// Prints a message and returns for unknown versions.  Works 2000 is
// recognised but not dumped yet.  A FileException thrown while dumping is
// passed on to the caller after the document stream has been released.
static void
dump_wps (GSFInputStream *istream)
{
	const int wversion = istream->getWorksVersion();

	if (WPS_VERSION_UNKNOWN == wversion)
	{
		printf("Unknown Works version\n");						
		return;		
	}
	
	GSFInputStream *document = istream->getDocumentOLEStream();
	if (!document)
	{
		g_error ("Input stream failed");
		return;
	}	

	try
	{
		switch (wversion)
		{
			case WPS_VERSION_4:
				printf("Works version 4 format\n");
				dump_wps4(document);			
				break;
				
			case WPS_VERSION_2000:
				printf("Works version 2000 (v5) format\n");			
				break;			
				
			case WPS_VERSION_8:
				printf("Works version 8 (Suite 2005) format\n");			
				dump_wps8(document);
				break;
		}
	}
	catch (...)
	{
		// The dump routines throw on truncated input; do not leak the
		// document stream when that happens.
		delete document;
		throw;
	}
	
	delete document;
}


int main(int argc, char **argv)
{
	GsfInput *input;
	GError *err = NULL;
	char *fn;
	
	if (argc < 2)
	{
		g_error("put filename on command line");
	}
	
	fn = argv[1];

	gsf_init ();
	printf("\ndebug: opening %s\n", fn);
	input = gsf_input_stdio_new(fn, &err);
	if (NULL == input)
	{
		g_return_val_if_fail (err != NULL, 1);
		g_warning ("'%s' error: %s", fn, err->message);
		g_error_free (err);
		return 1;
	}
	
	
	GSFInputStream istream(input);
	
	dump_wps(&istream);
	
//	infile = gsf_infile_msole_new (input, &err);
//	g_object_unref (G_OBJECT (input));
	
	gsf_shutdown ();	
	return 0;
}
all: wps_test

# The recipe line must start with a TAB.  pkg-config supplies the include
# paths and the link flags for libgsf together with its GLib/GObject
# dependencies, replacing the hand-written -I/-L flags (the old -L/usrlib
# was a typo for -L/usr/lib).
wps_test: wps_test.cpp
	g++ -g -Wall -o wps_test wps_test.cpp \
		`pkg-config --cflags --libs libgsf-1`
-------------------------------------------------------------------------
Using Tomcat but need to do more? Need to support web services, security?
Get stuff done quickly with pre-integrated technology to make your job easier
Download IBM WebSphere Application Server v.1.0.1 based on Apache Geronimo
http://sel.as-us.falkag.net/sel?cmd=lnk&kid=120709&bid=263057&dat=121642
_______________________________________________
Libwpd-devel mailing list
Libwpd-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/libwpd-devel

Reply via email to