Tag: cws_src680_dba205b User: oj Date: 2006/08/11 01:17:09 Modified: dba/dbaccess/source/ui/misc/HtmlReader.cxx dba/dbaccess/source/ui/inc/HtmlReader.hxx
Log: #i46408# set correct encoding File Changes: Directory: /dba/dbaccess/source/ui/misc/ ======================================== File [changed]: HtmlReader.cxx Url: http://dba.openoffice.org/source/browse/dba/dbaccess/source/ui/misc/HtmlReader.cxx?r1=1.25&r2=1.25.30.1 Delta lines: +35 -75 --------------------- --- HtmlReader.cxx 20 Jun 2006 03:20:01 -0000 1.25 +++ HtmlReader.cxx 11 Aug 2006 08:17:06 -0000 1.25.30.1 @@ -4,9 +4,9 @@ * * $RCSfile: HtmlReader.cxx,v $ * - * $Revision: 1.25 $ + * $Revision: 1.25.30.1 $ * - * last change: $Author: hr $ $Date: 2006/06/20 03:20:01 $ + * last change: $Author: oj $ $Date: 2006/08/11 08:17:06 $ * * The Contents of this file are made available subject to * the terms of GNU Lesser General Public License Version 2.1. @@ -50,9 +50,18 @@ #ifndef DBACCESS_SHARED_DBUSTRINGS_HRC #include "dbustrings.hrc" #endif +#ifndef _SFXDOCINF_HXX +#include <sfx2/docinf.hxx> +#endif +#ifndef _SFXHTML_HXX +#include <sfx2/sfxhtml.hxx> +#endif #ifndef _TOOLS_DEBUG_HXX #include <tools/debug.hxx> #endif +#ifndef _TOOLS_TENCCVT_HXX +#include <tools/tenccvt.hxx> +#endif #ifndef _DBAUI_MODULE_DBU_HXX_ #include "moduledbu.hxx" #endif @@ -197,12 +206,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::OHTMLReader(SvStream& rIn, @@ -221,12 +227,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::~OHTMLReader() @@ -244,33 +247,12 @@ return m_bFoundTable ? eParseState : SVPAR_ERROR; } // ----------------------------------------------------------------------------- -rtl_TextEncoding OHTMLReader::GetEncodingByMIME( const String& rMime ) -{ - DBG_CHKTHIS(OHTMLReader,NULL); - ByteString sType; - ByteString sSubType; - INetContentTypeParameterList aParameters; - ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); - if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) - { - const INetContentTypeParameter * pCharset - = aParameters.find("charset"); - if (pCharset != 0) - { - ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); - return rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ); - } - } - return RTL_TEXTENCODING_DONTKNOW; -} - -// --------------------------------------------------------------------------- void OHTMLReader::NextToken( int nToken ) { DBG_CHKTHIS(OHTMLReader,NULL); if(m_bError || !m_nRows) // falls Fehler oder keine Rows mehr zur "Uberpr"ufung dann gleich zur"uck return; - if(!m_bMetaOptions) + if ( nToken == HTML_META ) setTextEncoding(); if(m_xConnection.is()) // gibt an welcher CTOR gerufen wurde und damit, ob eine Tabelle erstellt werden soll @@ -668,43 +650,21 @@ DBG_CHKTHIS(OHTMLReader,NULL); m_bMetaOptions = sal_True; USHORT nContentOption = HTML_O_CONTENT; - String aName, aContent; - USHORT nAction = HTML_META_NONE; - BOOL bHTTPEquiv = FALSE; - const HTMLOptions *pHtmlOptions = GetOptions(&nContentOption); - for( USHORT i = pHtmlOptions->Count(); i; ) - { - const HTMLOption *pOption = (*pHtmlOptions)[ --i ]; - switch( pOption->GetToken() ) - { - case HTML_O_HTTPEQUIV: - aName = pOption->GetString(); - pOption->GetEnum( nAction, getOptions() ); - bHTTPEquiv = TRUE; - break; - case HTML_O_CONTENT: - aContent = pOption->GetString(); - break; - } - } - if( bHTTPEquiv || HTML_META_DESCRIPTION!=nAction ) - { - // wenn's keine Description ist CRs und LFs aus dem CONTENT entfernen - aContent.EraseAllChars( _CR ); - aContent.EraseAllChars( _LF ); - } - else - { - // fuer die Beschreibung die Zeilen-Umbrueche entsprechen wandeln - aContent.ConvertLineEnd(); - } - switch( nAction ) - { - case HTML_META_CONTENT_TYPE: - if( aContent.Len() ) - SetSrcEncoding(GetEncodingByMIME( aContent )); - break; - } + rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; + USHORT nMetaTags = 0; + + ::std::auto_ptr<SfxDocumentInfo> pInfo(new SfxDocumentInfo()); + BOOL bRet = SfxHTMLParser::ParseMetaOptions( pInfo.get(), NULL, + GetOptions(&nContentOption), + nMetaTags, eEnc ); + + // If the encoding is set by a META tag, it may only overwrite the + // current encoding if both, the current and the new encoding, are 1-BYTE + // encodings. Everything else cannot lead to reasonable results. + if( RTL_TEXTENCODING_DONTKNOW != eEnc && + rtl_isOctetTextEncoding( eEnc ) && + rtl_isOctetTextEncoding( GetSrcEncoding() ) ) + SetSrcEncoding( eEnc ); } // ----------------------------------------------------------------------------- void OHTMLReader::release() Directory: /dba/dbaccess/source/ui/inc/ ======================================= File [changed]: HtmlReader.hxx Url: http://dba.openoffice.org/source/browse/dba/dbaccess/source/ui/inc/HtmlReader.hxx?r1=1.12&r2=1.12.34.1 Delta lines: +4 -5 ------------------- --- HtmlReader.hxx 20 Jun 2006 03:11:46 -0000 1.12 +++ HtmlReader.hxx 11 Aug 2006 08:17:07 -0000 1.12.34.1 @@ -4,9 +4,9 @@ * * $RCSfile: HtmlReader.hxx,v $ * - * $Revision: 1.12 $ + * $Revision: 1.12.34.1 $ * - * last change: $Author: hr $ $Date: 2006/06/20 03:11:46 $ + * last change: $Author: oj $ $Date: 2006/08/11 08:17:07 $ * * The Contents of this file are made available subject to * the terms of GNU Lesser General Public License Version 2.1. @@ -76,10 +76,9 @@ void TableDataOn(SvxCellHorJustify& eVal,int nToken); void TableFontOn(::com::sun::star::awt::FontDescriptor& _rFont,sal_Int32 &_rTextColor); sal_Int16 GetWidthPixel( const HTMLOption* pOption ); - rtl_TextEncoding GetEncodingByMIME( const String& rMime ); void setTextEncoding(); void fetchOptions(); - ~OHTMLReader(); + virtual ~OHTMLReader(); public: OHTMLReader(SvStream& rIn, const SharedConnection& _rxConnection, --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
