sc/Library_scui.mk | 3 + sc/source/ui/dbgui/scuiasciiopt.cxx | 71 +++++++++++++++++++----------------- 2 files changed, 41 insertions(+), 33 deletions(-)
New commits: commit 85f12e47f4a086a3923dd3a6b097776d60c6dc82 Author: Tomofumi Yagi <[email protected]> AuthorDate: Sat Sep 12 11:47:10 2020 +0900 Commit: Noel Grandin <[email protected]> CommitDate: Sun Sep 13 13:21:43 2020 +0200 Calc: ScImportAsciiDlg can now detect Unicode encoding without BOM Change-Id: I8a3aa7458ce97f659c0caf2386a96f605b740fbc Reviewed-on: https://gerrit.libreoffice.org/c/core/+/102543 Tested-by: Jenkins Reviewed-by: Noel Grandin <[email protected]> diff --git a/sc/Library_scui.mk b/sc/Library_scui.mk index a8c2097485b0..86605ab63a0d 100644 --- a/sc/Library_scui.mk +++ b/sc/Library_scui.mk @@ -39,6 +39,9 @@ $(eval $(call gb_Library_use_externals,scui,\ $(call gb_Helper_optional,OPENCL, \ clew) \ mdds_headers \ + icui18n \ + icuuc \ + icu_headers \ )) $(eval $(call gb_Library_use_libraries,scui,\ diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx index a0e645e551e0..5e5f08bf87a7 100644 --- a/sc/source/ui/dbgui/scuiasciiopt.cxx +++ b/sc/source/ui/dbgui/scuiasciiopt.cxx @@ -37,6 +37,9 @@ #include <miscuno.hxx> #include <osl/diagnose.h> +#include <unicode/uclean.h> +#include <unicode/ucsdet.h> + //! TODO make dynamic const SCSIZE ASCIIDLG_MAXROWS = MAXROWCOUNT; @@ -380,41 +383,43 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa // Sniff for Unicode / not if( ePreselectUnicode == RTL_TEXTENCODING_DONTKNOW && mpDatStream ) { - Seek( 0 ); - mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); - sal_uLong nUniPos = mpDatStream->Tell(); - switch (nUniPos) + mpDatStream->Seek( 0 ); + constexpr size_t buffsize = 4096; + sal_Int8 bytes[buffsize] = { 0 }; + sal_Int32 nRead = mpDatStream->ReadBytes( bytes, buffsize ); + mpDatStream->Seek( 0 ); + + if ( nRead > 0 ) { - case 2: - ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16 - break; - case 3: - ePreselectUnicode = RTL_TEXTENCODING_UTF8; // UTF-8 - break; - case 0: - { - sal_uInt16 n; - mpDatStream->ReadUInt16( n ); - // Assume that normal ASCII/ANSI/ISO/etc. text doesn't start with - // control characters except CR,LF,TAB - if ( (n & 0xff00) < 0x2000 ) - { - switch ( n & 0xff00 ) - { - case 0x0900 : - case 0x0a00 : - case 0x0d00 : - break; - default: - ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16 - } - } - mpDatStream->Seek(0); - } - break; - default: - ; // nothing + UErrorCode uerr = U_ZERO_ERROR; + UCharsetDetector* ucd = ucsdet_open( &uerr ); + ucsdet_setText( ucd, reinterpret_cast<const char*>(bytes), nRead, &uerr ); + const UCharsetMatch* match = ucsdet_detect( ucd, &uerr ); + const char* pEncodingName = ucsdet_getName( match, &uerr ); + + if ( U_SUCCESS(uerr) && !strcmp("UTF-8", pEncodingName) ) + { + ePreselectUnicode = RTL_TEXTENCODING_UTF8; // UTF-8 + mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UTF8 ); + } + else if ( U_SUCCESS(uerr) && !strcmp("UTF-16LE", pEncodingName) ) + { + ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16LE + mpDatStream->SetEndian( SvStreamEndian::LITTLE ); + mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE ); + } + else if ( U_SUCCESS(uerr) && !strcmp("UTF-16BE", pEncodingName) ) + { + ePreselectUnicode = RTL_TEXTENCODING_UNICODE; // UTF-16BE + mpDatStream->SetEndian(SvStreamEndian::BIG); + mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_UNICODE ); + } + else // other + mpDatStream->StartReadingUnicodeText( RTL_TEXTENCODING_DONTKNOW ); + + ucsdet_close( ucd ); } + mnStreamPos = mpDatStream->Tell(); } _______________________________________________ Libreoffice-commits mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits
