Tag: cws_src680_mingwport03 User: vg Date: 2006/11/07 06:33:31 Modified: dba/dbaccess/source/ui/misc/HtmlReader.cxx
Log: RESYNC: (1.25-1.28); FILE MERGED File Changes: Directory: /dba/dbaccess/source/ui/misc/ ======================================== File [changed]: HtmlReader.cxx Url: http://dba.openoffice.org/source/browse/dba/dbaccess/source/ui/misc/HtmlReader.cxx?r1=1.25.50.1&r2=1.25.50.2 Delta lines: +42 -75 --------------------- --- HtmlReader.cxx 7 Sep 2006 10:36:14 -0000 1.25.50.1 +++ HtmlReader.cxx 7 Nov 2006 14:33:29 -0000 1.25.50.2 @@ -32,6 +32,9 @@ * MA 02111-1307 USA * ************************************************************************/ + +// MARKER(update_precomp.py): autogen include statement, do not remove +#include "precompiled_dbaccess.hxx" #ifndef DBAUI_HTMLREADER_HXX #include "HtmlReader.hxx" #endif @@ -41,6 +44,9 @@ #ifndef _CONNECTIVITY_DBTOOLS_HXX_ #include <connectivity/dbtools.hxx> #endif +#ifndef _TOOLS_TENCCVT_HXX +#include <tools/tenccvt.hxx> +#endif #ifndef _COMPHELPER_EXTRACT_HXX_ #include <comphelper/extract.hxx> #endif @@ -50,9 +56,18 @@ #ifndef DBACCESS_SHARED_DBUSTRINGS_HRC #include "dbustrings.hrc" #endif +#ifndef _SFXDOCINF_HXX +#include <sfx2/docinf.hxx> +#endif +#ifndef _SFXHTML_HXX +#include <sfx2/sfxhtml.hxx> +#endif #ifndef _TOOLS_DEBUG_HXX #include <tools/debug.hxx> #endif +#ifndef _TOOLS_TENCCVT_HXX +#include <tools/tenccvt.hxx> +#endif #ifndef _DBAUI_MODULE_DBU_HXX_ #include "moduledbu.hxx" #endif @@ -201,12 +216,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::OHTMLReader(SvStream& rIn, @@ -225,12 +237,9 @@ ,m_bSDNum(sal_False) { DBG_CTOR(OHTMLReader,NULL); - // If the system encoding is ANSI, this encoding is used as default - // source encoding. Otherwise ISO-8859-1 will be used, because this - // is the real default encoding. - SetSrcEncoding( RTL_TEXTENCODING_MS_1252 == gsl_getSystemTextEncoding() - ? RTL_TEXTENCODING_MS_1252 - : RTL_TEXTENCODING_ISO_8859_1 ); + SetSrcEncoding( GetExtendedCompatibilityTextEncoding( RTL_TEXTENCODING_ISO_8859_1 ) ); + // If the file starts with a BOM, switch to UCS2. + SetSwitchToUCS2( TRUE ); } // --------------------------------------------------------------------------- OHTMLReader::~OHTMLReader() @@ -248,33 +257,12 @@ return m_bFoundTable ? eParseState : SVPAR_ERROR; } // ----------------------------------------------------------------------------- -rtl_TextEncoding OHTMLReader::GetEncodingByMIME( const String& rMime ) -{ - DBG_CHKTHIS(OHTMLReader,NULL); - ByteString sType; - ByteString sSubType; - INetContentTypeParameterList aParameters; - ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); - if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) - { - const INetContentTypeParameter * pCharset - = aParameters.find("charset"); - if (pCharset != 0) - { - ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); - return rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ); - } - } - return RTL_TEXTENCODING_DONTKNOW; -} - -// --------------------------------------------------------------------------- void OHTMLReader::NextToken( int nToken ) { DBG_CHKTHIS(OHTMLReader,NULL); if(m_bError || !m_nRows) // falls Fehler oder keine Rows mehr zur "Uberpr"ufung dann gleich zur"uck return; - if(!m_bMetaOptions) + if ( nToken == HTML_META ) setTextEncoding(); if(m_xConnection.is()) // gibt an welcher CTOR gerufen wurde und damit, ob eine Tabelle erstellt werden soll @@ -672,43 +660,21 @@ DBG_CHKTHIS(OHTMLReader,NULL); m_bMetaOptions = sal_True; USHORT nContentOption = HTML_O_CONTENT; - String aName, aContent; - USHORT nAction = HTML_META_NONE; - BOOL bHTTPEquiv = FALSE; - const HTMLOptions *pHtmlOptions = GetOptions(&nContentOption); - for( USHORT i = pHtmlOptions->Count(); i; ) - { - const HTMLOption *pOption = (*pHtmlOptions)[ --i ]; - switch( pOption->GetToken() ) - { - case HTML_O_HTTPEQUIV: - aName = pOption->GetString(); - pOption->GetEnum( nAction, getOptions() ); - bHTTPEquiv = TRUE; - break; - case HTML_O_CONTENT: - aContent = pOption->GetString(); - break; - } - } - if( bHTTPEquiv || HTML_META_DESCRIPTION!=nAction ) - { - // wenn's keine Description ist CRs und LFs aus dem CONTENT entfernen - aContent.EraseAllChars( _CR ); - aContent.EraseAllChars( _LF ); - } - else - { - // fuer die Beschreibung die Zeilen-Umbrueche entsprechen wandeln - aContent.ConvertLineEnd(); - } - switch( nAction ) - { - case HTML_META_CONTENT_TYPE: - if( aContent.Len() ) - SetSrcEncoding(GetEncodingByMIME( aContent )); - break; - } + rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; + USHORT nMetaTags = 0; + + ::std::auto_ptr<SfxDocumentInfo> pInfo(new SfxDocumentInfo()); + SfxHTMLParser::ParseMetaOptions( pInfo.get(), NULL, + GetOptions(&nContentOption), + nMetaTags, eEnc ); + + // If the encoding is set by a META tag, it may only overwrite the + // current encoding if both, the current and the new encoding, are 1-BYTE + // encodings. Everything else cannot lead to reasonable results. + if( RTL_TEXTENCODING_DONTKNOW != eEnc && + rtl_isOctetTextEncoding( eEnc ) && + rtl_isOctetTextEncoding( GetSrcEncoding() ) ) + SetSrcEncoding( eEnc ); } // ----------------------------------------------------------------------------- void OHTMLReader::release() @@ -723,3 +689,4 @@ return new OWizHTMLExtend(_pParent,rInput); } // ----------------------------------------------------------------------------- + --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]
