User: ihi Date: 2006/10/18 06:31:59 Modified: dba/dbaccess/source/ui/misc/HtmlReader.cxx
Log: INTEGRATION: CWS dba205b (1.25.30); FILE MERGED 2006/09/08 11:45:04 oj 1.25.30.3: removed unused var 2006/09/04 12:08:09 oj 1.25.30.2: RESYNC: (1.25-1.26); FILE MERGED 2006/08/11 08:17:06 oj 1.25.30.1: #i46408# set correct encoding File Changes: Directory: /dba/dbaccess/source/ui/misc/ ======================================== File [changed]: HtmlReader.cxx Url: http://dba.openoffice.org/source/browse/dba/dbaccess/source/ui/misc/HtmlReader.cxx?r1=1.27&r2=1.28 Delta lines: +34 -86 --------------------- --- HtmlReader.cxx 17 Sep 2006 07:12:40 -0000 1.27 +++ HtmlReader.cxx 18 Oct 2006 13:31:57 -0000 1.28 @@ -56,9 +56,18 @@ #ifndef DBACCESS_SHARED_DBUSTRINGS_HRC #include "dbustrings.hrc" #endif +#ifndef _SFXDOCINF_HXX +#include <sfx2/docinf.hxx> +#endif +#ifndef _SFXHTML_HXX +#include <sfx2/sfxhtml.hxx> +#endif #ifndef _TOOLS_DEBUG_HXX #include <tools/debug.hxx> #endif +#ifndef _TOOLS_TENCCVT_HXX +#include <tools/tenccvt.hxx> +#endif #ifndef _DBAUI_MODULE_DBU_HXX_ #include "moduledbu.hxx" #endif @@ -203,12 +212,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::OHTMLReader(SvStream& rIn, @@ -227,12 +233,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::~OHTMLReader() @@ -250,41 +253,18 @@ return m_bFoundTable ? eParseState : SVPAR_ERROR; } // ----------------------------------------------------------------------------- -rtl_TextEncoding OHTMLReader::GetEncodingByMIME( const String& rMime ) -{ - DBG_CHKTHIS(OHTMLReader,NULL); - ByteString sType; - ByteString sSubType; - INetContentTypeParameterList aParameters; - ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); - if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) - { - const INetContentTypeParameter * pCharset - = aParameters.find("charset"); - if (pCharset != 0) - { - ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); - return rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ); - } - } - return RTL_TEXTENCODING_DONTKNOW; -} - -// --------------------------------------------------------------------------- void OHTMLReader::NextToken( int nToken ) { DBG_CHKTHIS(OHTMLReader,NULL); if(m_bError || !m_nRows) // falls Fehler oder keine Rows mehr zur "Uberpr"ufung dann gleich zur"uck return; + if ( nToken == HTML_META ) + setTextEncoding(); if(m_xConnection.is()) // gibt an welcher CTOR gerufen wurde und damit, ob eine Tabelle erstellt werden soll { switch(nToken) { - case HTML_META: - if(!m_bMetaOptions) - setTextEncoding(); - break; case HTML_TABLE_ON: ++m_nTableCount; { // es kann auch TD oder TH sein, wenn es vorher kein TABLE gab @@ -676,54 +656,21 @@ DBG_CHKTHIS(OHTMLReader,NULL); m_bMetaOptions = sal_True; USHORT nContentOption = HTML_O_CONTENT; - String aName, aContent; - USHORT nAction = HTML_META_NONE; - BOOL bHTTPEquiv = FALSE; - const HTMLOptions *pHtmlOptions = GetOptions(&nContentOption); - for( USHORT i = pHtmlOptions->Count(); i; ) - { - const HTMLOption *pOption = (*pHtmlOptions)[ --i ]; - switch( pOption->GetToken() ) - { - case HTML_O_HTTPEQUIV: - aName = pOption->GetString(); - pOption->GetEnum( nAction, getOptions() ); - bHTTPEquiv = TRUE; - break; - case HTML_O_CONTENT: - aContent = pOption->GetString(); - break; - } - } - if( bHTTPEquiv || HTML_META_DESCRIPTION!=nAction ) - { - // wenn's keine Description ist CRs und LFs aus dem CONTENT entfernen - aContent.EraseAllChars( _CR ); - aContent.EraseAllChars( _LF ); - } - else - { - // fuer die Beschreibung die Zeilen-Umbrueche entsprechen wandeln - aContent.ConvertLineEnd(); - } - switch( nAction ) - { - case HTML_META_CONTENT_TYPE: - if( aContent.Len() ) - { - rtl_TextEncoding eEnc = GetEncodingByMIME( aContent ); + rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; + USHORT nMetaTags = 0; + + ::std::auto_ptr<SfxDocumentInfo> pInfo(new SfxDocumentInfo()); + SfxHTMLParser::ParseMetaOptions( pInfo.get(), NULL, + GetOptions(&nContentOption), + nMetaTags, eEnc ); + // If the encoding is set by a META tag, it may only overwrite the // current encoding if both, the current and the new encoding, are 1-BYTE // encodings. Everything else cannot lead to reasonable results. - if ( rtl_isOctetTextEncoding( eEnc ) && + if( RTL_TEXTENCODING_DONTKNOW != eEnc && + rtl_isOctetTextEncoding( eEnc ) && rtl_isOctetTextEncoding( GetSrcEncoding() ) ) - { - eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); SetSrcEncoding( eEnc ); - } - } - break; - } } // ----------------------------------------------------------------------------- void OHTMLReader::release() @@ -738,3 +685,4 @@ return new OWizHTMLExtend(_pParent,rInput); } // ----------------------------------------------------------------------------- + --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
