[Libreoffice-commits] core.git: sal/qa sal/textenc

2019-09-11 Thread Stephan Bergmann (via logerrit)
 sal/qa/rtl/textenc/rtl_textcvt.cxx |6 +++---
 sal/textenc/tcvtjp2.tab|4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

New commits:
commit 27808e6e3ed049dda09f552b7769a4e87a82283a
Author: Stephan Bergmann 
AuthorDate: Fri Sep 6 16:25:40 2019 +0200
Commit: Stephan Bergmann 
CommitDate: Wed Sep 11 19:25:49 2019 +0200

Fix Unicode to Shift JIS/MS932 conversion data

These are MS932 extensions, and per
<https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP932.TXT>
("Table version: 2.01", "Date: 04/15/98"), U+4F92 is a mapping for 0xFA6F (and
also for 0xED53, which is also an MS932 extension, and "loses" here), and
U+4F9A is a mapping for 0xFA71 (and also for 0xED55, which is also an MS932
extension, and "loses" here).  (And neither U+4F92 nor U+4F9A appear as
mappings in
<https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/SHIFTJIS.TXT>,
"Table version: 2.0", "Date: 2011 October 14 (header updated: 2015
December 02)".)

This appears to be a typo dating back to
9399c662f36c385b0c705eb34e636a9aec450282 "initial import".

Change-Id: I0c699675355d839e62d6e4082355a2d67472533e
Reviewed-on: https://gerrit.libreoffice.org/78720
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann 

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx 
b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index 068e727d53cb..795950a0347a 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -1765,9 +1765,9 @@ void Test::testComplex() {
   RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
 { RTL_TEXTENCODING_SHIFT_JIS,
-  RTL_CONSTASCII_STRINGPARAM("\x00"),
-  {0x},
-  1,
+  RTL_CONSTASCII_STRINGPARAM("\x00\xFA\x6F\xFA\x71"),
+  {0x, 0x4F92, 0x4F9A},
+  3,
   true,
   true,
   true,
diff --git a/sal/textenc/tcvtjp2.tab b/sal/textenc/tcvtjp2.tab
index fc9b0a3d98d7..1e4716b168bf 100644
--- a/sal/textenc/tcvtjp2.tab
+++ b/sal/textenc/tcvtjp2.tab
@@ -717,8 +717,8 @@ static sal_uInt16 const aImplUniToDBCSTab_SJIS_4F[] =
  0,  0,  0, 0x98CD, 0x8CF1,  0,  0, 0x8E67, /* 0x70 */
  0,  0,  0, 0x8AA4,  0,  0, 0x98D2,  0, /* 0x80 */
 0x98CA,  0, 0xFA70, 0x97E1,  0, 0x8E98,  0, 0x98CB, /* 0x80 */
- 0, 0x98D0, 0xFA71,  0, 0xFA72,  0, 0x98D3,  0, /* 0x90 */
-0x98CC,  0, 0xFA6F, 0x8B9F,  0, 0x88CB,  0,  0, /* 0x90 */
+ 0, 0x98D0, 0xFA6F,  0, 0xFA72,  0, 0x98D3,  0, /* 0x90 */
+0x98CC,  0, 0xFA71, 0x8B9F,  0, 0x88CB,  0,  0, /* 0x90 */
 0x8BA0, 0x89BF,  0,  0,  0,  0,  0,  0, /* 0xA0 */
  0,  0,  0, 0x9B44,  0, 0x9699, 0x958E, 0x8CF2, /* 0xA0 */
  0,  0,  0,  0,  0, 0x904E, 0x97B5,  0, /* 0xB0 */
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

[Libreoffice-commits] core.git: sal/qa sal/textenc

2019-09-05 Thread Stephan Bergmann (via logerrit)
 sal/qa/rtl/textenc/rtl_textcvt.cxx |9 +
 sal/textenc/tcvtmb.cxx |2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

New commits:
commit 411e4d15ff278c4fec77126f29916b5600aefbfb
Author: Stephan Bergmann 
AuthorDate: Thu Sep 5 09:57:32 2019 +0200
Commit: Stephan Bergmann 
CommitDate: Thu Sep 5 11:11:29 2019 +0200

Fix conversion of U+0000 in ImplUnicodeToDBCS

...which appears to have been broken when
13824735057ef25075af8fd0ddb8f14e34c7eeb6 "#81346# - Fix for unconverted
characters for DBCS encodings" moved that "if" out of surrounding "if" 
block.
(And, for consistency, write the "if" check in the same way as the 
preceding one
is written since 739cb04c36524c5a1bbf768dfe93624a1b2ec8b4 "#97705# Fixed 
mapping
of Big5 EUDC points.")

Change-Id: I4324197c4eba671ab6313fb89f988da102b8ffa5
Reviewed-on: https://gerrit.libreoffice.org/78627
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann 

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx 
b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index af9ccca345e7..068e727d53cb 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -1765,6 +1765,15 @@ void Test::testComplex() {
   RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
 #if WITH_LOCALE_ALL || WITH_LOCALE_ja
 { RTL_TEXTENCODING_SHIFT_JIS,
+  RTL_CONSTASCII_STRINGPARAM("\x00"),
+  {0x},
+  1,
+  true,
+  true,
+  true,
+  false,
+  RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
+{ RTL_TEXTENCODING_SHIFT_JIS,
   RTL_CONSTASCII_STRINGPARAM(
   "\x87\x40\x87\x41\x87\x42\x87\x43\x87\x44\x87\x45\x87\x46"
   "\x87\x47\x87\x48\x87\x49\x87\x4A\x87\x4B\x87\x4C\x87\x4D"
diff --git a/sal/textenc/tcvtmb.cxx b/sal/textenc/tcvtmb.cxx
index 4e990cc1e91e..89e89c56c628 100644
--- a/sal/textenc/tcvtmb.cxx
+++ b/sal/textenc/tcvtmb.cxx
@@ -320,7 +320,7 @@ sal_Size ImplUnicodeToDBCS( const void* pData, 
SAL_UNUSED_PARAMETER void*,
 }
 }
 
-if ( !cConv )
+if (cConv == 0 && c != 0)
 {
 if ( nFlags & RTL_UNICODETOTEXT_FLAGS_UNDEFINED_REPLACE )
 {
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

[Libreoffice-commits] core.git: sal/qa sal/textenc

2019-09-04 Thread Stephan Bergmann (via logerrit)
 sal/qa/rtl/textenc/rtl_textcvt.cxx|   30 ++
 sal/textenc/convertbig5hkscs.cxx  |   12 ++
 sal/textenc/converteuctw.cxx  |   14 +++-
 sal/textenc/convertgb18030.cxx|   14 +++-
 sal/textenc/convertisciidevangari.cxx |   17 ++
 sal/textenc/convertiso2022cn.cxx  |   14 +++-
 sal/textenc/convertiso2022jp.cxx  |   14 +++-
 sal/textenc/convertiso2022kr.cxx  |   14 +++-
 sal/textenc/convertsinglebytetobmpunicode.cxx |   12 ++
 sal/textenc/tcvtutf8.cxx  |   11 ++---
 sal/textenc/unichars.hxx  |8 --
 11 files changed, 110 insertions(+), 50 deletions(-)

New commits:
commit cd563e7b807fe038ebefb228e70bc587c040d17d
Author: Stephan Bergmann 
AuthorDate: Wed Sep 4 14:31:30 2019 +0200
Commit: Stephan Bergmann 
CommitDate: Wed Sep 4 19:56:33 2019 +0200

Do not exclude Unicode noncharacters from rtl_convertUnicodeToText

For one, that broke round-tripping with e.g. UTF-8 (see the test case added 
to
Test::testComplex in sal/qa/rtl/textenc/rtl_textcvt.cxx) which did not treat
noncharacters as invalid.

For another, the Unicode FAQ on noncharacters
<https://www.unicode.org/faq/private_use.html> is meanwhile
quite clear on the matter:

"Q: Are noncharacters prohibited in interchange?

"A: This question has led to some controversy, because the Unicode Standard 
has
been somewhat ambiguous about the status of noncharacters. The formal 
wording of
the definition of 'noncharacter' in the standard has always indicated that
noncharacters 'should never be interchanged.' That led some people to assume
that the definition actually meant 'shall not be interchanged' and that
therefore the presence of a noncharacter in any Unicode string immediately
rendered that string malformed according to the standard. But the intended 
use
of noncharacters requires the ability to exchange them in a limited 
context, at
least across APIs and even through data files and other means of 
'interchange',
so that they can be processed as intended. The choice of the word 'should' 
in
the original definition was deliberate, and indicated that one should not 
try to
interchange noncharacters precisely because their interpretation is strictly
internal to whatever implementation uses them, so they have no publicly
interchangeable semantics. But other informative wording in the text of the 
core
specification and in the character names list was differently and more 
strongly
worded, leading to contradictory interpretations.

"Given this ambiguity of intent, in 2013 the UTC issued Corrigendum #9, 
which
deleted the phrase 'and that should never be interchanged' from the 
definition
of noncharacters, to make it clear that prohibition from interchange is not 
part
of the formal definition of noncharacters. Corrigendum #9 has been 
incorporated
into the core specification for Unicode 7.0.

"Q: Are noncharacters invalid in Unicode strings and UTFs?

"A: Absolutely not. Noncharacters do not cause a Unicode string to be 
ill-formed
in any UTF. This can be seen explicitly in the table above, where every
noncharacter code point has a well-formed representation in UTF-32, in 
UTF-16,
and in UTF-8. An implementation which converts noncharacter code points 
between
one UTF representation and another must preserve these values correctly. The
fact that they are called 'noncharacters' and are not intended for open
interchange does not mean that they are somehow illegal or invalid code 
points
which make strings containing them invalid."

Change-Id: I4fcc0156e3d2fd305a7c7bb0c7b3dbef846c9e64
Reviewed-on: https://gerrit.libreoffice.org/78598
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann 

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx 
b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index 2f5359b32c77..af9ccca345e7 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -457,6 +457,8 @@ public:
 
 void testInvalidUtf8();
 
+void testInvalidUnicode();
+
 void testSRCBUFFERTOSMALL();
 
 void testMime();
@@ -471,6 +473,7 @@ public:
 CPPUNIT_TEST(testComplexCut);
 CPPUNIT_TEST(testInvalidUtf7);
 CPPUNIT_TEST(testInvalidUtf8);
+CPPUNIT_TEST(testInvalidUnicode);
 CPPUNIT_TEST(testSRCBUFFERTOSMALL);
 CPPUNIT_TEST(testMime);
 CPPUNIT_TEST(testWindows);
@@ -2336,6 +2339,15 @@ void Test::testComplex() {
   true,
   false,
   RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
+{ RTL_TEXTENCODING_UTF8,
+  RTL_CONSTASCII_STRINGPARAM("\xEF\xBF\xBF"),
+  {0x},
+  1,
+  false,
+  true,
+  true,

[Libreoffice-commits] core.git: sal/qa sal/textenc

2019-09-03 Thread Stephan Bergmann (via logerrit)
 sal/qa/rtl/textenc/rtl_textcvt.cxx |   21 +
 sal/textenc/tcvtutf7.cxx   |   17 ++---
 2 files changed, 31 insertions(+), 7 deletions(-)

New commits:
commit 238f6f42b381198e14f6d17649d8465425c7450f
Author: Stephan Bergmann 
AuthorDate: Tue Sep 3 15:50:09 2019 +0200
Commit: Stephan Bergmann 
CommitDate: Tue Sep 3 20:18:28 2019 +0200

Fix handling of invalid bytes >= 0x80 in ImplUTF7ToUnicode

Change-Id: I08838f9ae34a31712d7269ddaaee3fe59ece2178
Reviewed-on: https://gerrit.libreoffice.org/78562
Tested-by: Jenkins
Reviewed-by: Stephan Bergmann 

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx 
b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index 339075decba3..6b5a7e55fe21 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
 
 void testComplexCut();
 
+void testInvalidUtf7();
+
 void testInvalidUtf8();
 
 void testSRCBUFFERTOSMALL();
@@ -467,6 +469,7 @@ public:
 CPPUNIT_TEST(testSingleByte);
 CPPUNIT_TEST(testComplex);
 CPPUNIT_TEST(testComplexCut);
+CPPUNIT_TEST(testInvalidUtf7);
 CPPUNIT_TEST(testInvalidUtf8);
 CPPUNIT_TEST(testSRCBUFFERTOSMALL);
 CPPUNIT_TEST(testMime);
@@ -2638,6 +2641,24 @@ void Test::testComplexCut() {
 #endif
 }
 
+void Test::testInvalidUtf7() {
+auto const converter = 
rtl_createTextToUnicodeConverter(RTL_TEXTENCODING_UTF7);
+CPPUNIT_ASSERT(converter != nullptr);
+sal_Unicode buf[TEST_STRING_SIZE];
+sal_uInt32 info;
+sal_Size converted;
+auto const size = rtl_convertTextToUnicode(
+converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80"), buf, 
TEST_STRING_SIZE,
+(RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR | 
RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT | 
RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+, );
+CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+CPPUNIT_ASSERT_EQUAL(OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+CPPUNIT_ASSERT_EQUAL(sal_Size(1), converted);
+rtl_destroyTextToUnicodeConverter(converter);
+}
+
 void Test::testInvalidUtf8() {
 // UTF-8, invalid bytes:
 {
diff --git a/sal/textenc/tcvtutf7.cxx b/sal/textenc/tcvtutf7.cxx
index 46c135859566..dd97b213750b 100644
--- a/sal/textenc/tcvtutf7.cxx
+++ b/sal/textenc/tcvtutf7.cxx
@@ -358,15 +358,18 @@ sal_Size ImplUTF7ToUnicode( SAL_UNUSED_PARAMETER const 
void*, void* pContext,
 = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER;
 }
 }
-
-/* Write char to unicode buffer */
-if ( pDestBuf >= pEndDestBuf )
+else
 {
-*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | 
RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
-break;
+/* Write char to unicode buffer */
+if ( pDestBuf >= pEndDestBuf )
+{
+*pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | 
RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOOSMALL;
+break;
+}
+*pDestBuf = c;
+pDestBuf++;
+
 }
-*pDestBuf = c;
-pDestBuf++;
 }
 }
 
___
Libreoffice-commits mailing list
libreoffice-comm...@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libreoffice-commits

[Libreoffice-commits] core.git: sal/qa sal/textenc sw/qa writerfilter/source

2017-09-24 Thread Stephan Bergmann
 sal/qa/rtl/textenc/rtl_textcvt.cxx |1 +
 sal/textenc/tencinfo.cxx   |2 ++
 sw/qa/extras/rtfexport/rtfexport2.cxx  |2 +-
 writerfilter/source/rtftok/rtfdocumentimpl.cxx |6 +-
 4 files changed, 9 insertions(+), 2 deletions(-)

New commits:
commit 53b96765c555146e5c6a3a614420bfeeebc92b58
Author: Stephan Bergmann 
Date:   Tue Sep 19 16:26:24 2017 +0200

Map Windows code page 42 to RTL_TEXTENCODING_SYMBOL

 "WideCharToMultiByte function" suggests that there 
now
is CP_SYMBOL, "Windows 2000: Symbol code page (42)."  And a little test 
program
on Windows indicates that our RTL_TEXTENCODING_SYMBOL is working the same 
way as
CP_SYMBOL, where MultiByteToWideChar maps 00..1F to U+0000..1F and 20..FF to
U+F020..F0FF.

At least CppunitTest_writerfilter_rtftok, when testing
writerfilter/qa/cppunittests/rtftok/data/pass/EDB-18940-1.rtf, goes into 
case
RTF_FCHARSET in RTFDocumentImpl::dispatchValue
(writerfilter/source/rtftok/rtfdispatchvalue.cxx) with nParam matching
aRTFEncodings[2] (i.e., a mapping from charset 2 to codepage 42, see
writerfilter/source/rtftok/rtfcharsets.cxx), then passes 42 into
rtl_getTextEncodingFromWindowsCodePage and obtains an unhelpful
RTL_TEXTENCODING_DONTKNOW.

testFdo72031 (sw/qa/extras/rtfexport/rtfexport2.cxx, 
CppunitTest_sw_rtfexport2)
needed to be adapted, as the circled plus from the Symbol font is now 
internally
represented as U+F0C5, not (somewhat bogusly) as U+00C5 (aka LATIN CAPITAL
LETTER A WITH RING ABOVE).  But, when displayed with the Symbol font, the 
glyph
that is actually shown remains the circled plus.

Turns out changing rtl_getTextEncodingFromWindowsCodePage would start to 
make
CppunitTest_sw_rtfimport fail:

  Sep 20 15:49:24  vmiklos, with
   , testN823675
   (sw/qa/extras/rtfimport/rtfimport.cxx) fails, the aFont.Name is not 
"Symbol";
   sw/qa/extras/rtfimport/data/n823675.rtf contains a \fonttbl that 
specifies
   \f3 to have \fcharset2 (i.e., symbol font) and fontname "Symbol".  
However,
   RTFDocumentImpl::checkUnicode
   (writerfilter/source/rtftok/rtfdocumentimpl.cxx)
   converts m_aHexBuffer (containing "Symbol;") with nCurrentEncoding 
apparently
   being the encoding specified by \fcharset2 (i.e., now 
RTL_TEXTENCODING_SYMBOL
   instead of old RTL_TEXTENCODING_DONTKNOW), so the resulting OUString is
   garbage
   (instead of the byte-for-byte conversion to Unicode "Symbol;" that
   RTL_TEXTENCODING_DONTKNOW would do there); do you know whether such 
\fonttbl
   fontnames should actually be interpreted in the given \fcharset?
  Sep 20 15:49:24  gerrit: »Map Windows code page 42 to
   RTL_TEXTENCODING_SYMBOL« by Stephan Bergmann for master [NEW]
  Sep 20 15:51:15  sberg: let me check if the spec covers that
  Sep 20 15:54:29  sberg: i think the name is typically encoded in the
   font's encoding but probably they have to make a (likely undocumented)
   exception for symbol encoding
  Sep 20 15:57:46  sberg: the spec only says that \fcharset is 
about
   the encoding of the content using that font, i don't see it described 
what
   would be the encoding of the font name itself
  Sep 20 15:58:51  sberg: i'm not sure about if that encoding 
should or
   should not affect the encoding of the font name in general, but indeed at
   least for 2 (symbol encoding) you're right, Word doesn't encoding the 
font
   name with that encoding, either.
  Sep 20 15:59:30  vmiklos, mst_, at the top of page 14 of
   Word2007RTFSpec9.docx I see "Note that runs of text marked with a 
particular
   font index (see \fN in the Font Table section) use the codepage for that 
font
   as given by \cpgN or implied by \fcharsetN, unless they use Unicode RTF
   described in the following section."  Would that match what mst_ says?
  Sep 20 15:59:33  so if it helps you case to handle at as e.g. 
ascii,
   just for that encoding, i think there would be no problem with that.
  Sep 20 16:00:07  sberg: that still talks about the content using 
the
   font, not the strings (font names) in the font table itself, i think.
  Sep 20 16:01:17  vmiklos, what's the control word to select such a
   font, also \fN?  I don't see any such in n823675.rtf
  Sep 20 16:02:16  loircbot: e.g. \af3
  Sep 20 16:02:31  sberg: ^
  Sep 20 16:02:47  04d5a280beeeb6e056df68395dc9c3b3a674361b
  Sep 20 16:02:50  core - related: fdo#77979: writerfilter RTF 
import:
   read encoded font name -
   
http://cgit.freedesktop.org/libreoffice/core/commit/?id=04d5a280beeeb6e056df68395dc9c3b3a674361b
  Sep 20 16:02:52  sberg: ^
  Sep 20 

[Libreoffice-commits] core.git: sal/qa sal/textenc

2017-09-13 Thread Stephan Bergmann
 sal/qa/rtl/textenc/rtl_textcvt.cxx |  332 +
 sal/textenc/tcvtutf8.cxx   |   73 +---
 2 files changed, 352 insertions(+), 53 deletions(-)

New commits:
commit 08e78607ec6bc820c52ab3df1a5d3738e049b90d
Author: Stephan Bergmann 
Date:   Wed Sep 13 08:28:32 2017 +0200

Make reading UTF-8 strict

Consider non-shortest forms, surrogates, and representations of values 
larger
than 0x10FFFF (which can even cover five or six bytes, for historical 
reasons)
as "invalid" (they used to be considered as "undefined" instead).

This is in response to fc670f637d4271246691904fd649358ce2e7be59 "svtools: 
HTML
import: don't put lone surrogates in OUString" (which can now be reverted 
again
in a follow-up commit).  My fear would have been that some places in the 
code
rely on the original, relaxed handling, but at least 'make check' still
succeeded for me.

Change-Id: I017e6c04ed3c577c3694b417167f853987a1d1ce

diff --git a/sal/qa/rtl/textenc/rtl_textcvt.cxx 
b/sal/qa/rtl/textenc/rtl_textcvt.cxx
index d698bc22cd74..3c36852bebfc 100644
--- a/sal/qa/rtl/textenc/rtl_textcvt.cxx
+++ b/sal/qa/rtl/textenc/rtl_textcvt.cxx
@@ -453,6 +453,8 @@ public:
 
 void testComplexCut();
 
+void testInvalidUtf8();
+
 void testSRCBUFFERTOSMALL();
 
 void testMime();
@@ -465,6 +467,7 @@ public:
 CPPUNIT_TEST(testSingleByte);
 CPPUNIT_TEST(testComplex);
 CPPUNIT_TEST(testComplexCut);
+CPPUNIT_TEST(testInvalidUtf8);
 CPPUNIT_TEST(testSRCBUFFERTOSMALL);
 CPPUNIT_TEST(testMime);
 CPPUNIT_TEST(testWindows);
@@ -2330,35 +2333,6 @@ void Test::testComplex() {
   true,
   false,
   RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-{ RTL_TEXTENCODING_UTF8,
-  RTL_CONSTASCII_STRINGPARAM(
-  "\xC0\x80\xE0\x80\x81\xF0\x80\x80\x82\xF8\x80\x80\x80\x83"
-  "\xFC\x80\x80\x80\x80\x84"),
-  { 0x,0x0001,0x0002,0x0003,0x0004 },
-  5,
-  false,
-  true,
-  false,
-  false,
-  RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-{ RTL_TEXTENCODING_UTF8,
-  RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\xED\xB4\x93"),
-  { 0xD849,0xDD13 },
-  2,
-  false,
-  true,
-  false,
-  false,
-  RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
-{ RTL_TEXTENCODING_UTF8,
-  RTL_CONSTASCII_STRINGPARAM("\xED\xA1\x89\x41"),
-  { 0xD849,0x0041 },
-  2,
-  false,
-  true,
-  false,
-  false,
-  RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR },
 
 // Test Java UTF-8:
 
@@ -2664,6 +2638,306 @@ void Test::testComplexCut() {
 #endif
 }
 
+void Test::testInvalidUtf8() {
+// UTF-8, invalid bytes:
+{
+auto const converter = rtl_createTextToUnicodeConverter(
+RTL_TEXTENCODING_UTF8);
+CPPUNIT_ASSERT(converter != nullptr);
+sal_Unicode buf[TEST_STRING_SIZE];
+sal_uInt32 info;
+sal_Size converted;
+auto const size = rtl_convertTextToUnicode(
+converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\x80\xBF\xFE\xFF"),
+buf, TEST_STRING_SIZE,
+(RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+, );
+CPPUNIT_ASSERT_EQUAL(sal_Size(4), size);
+CPPUNIT_ASSERT_EQUAL(
+OUString(u"\uFFFD\uFFFD\uFFFD\uFFFD"),
+OUString(buf, sal_Int32(size)));
+CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+CPPUNIT_ASSERT_EQUAL(sal_Size(4), converted);
+rtl_destroyTextToUnicodeConverter(converter);
+}
+// UTF-8, non-shortest two-byte sequence:
+{
+auto const converter = rtl_createTextToUnicodeConverter(
+RTL_TEXTENCODING_UTF8);
+CPPUNIT_ASSERT(converter != nullptr);
+sal_Unicode buf[TEST_STRING_SIZE];
+sal_uInt32 info;
+sal_Size converted;
+auto const size = rtl_convertTextToUnicode(
+converter, nullptr, RTL_CONSTASCII_STRINGPARAM("\xC0\x80"),
+buf, TEST_STRING_SIZE,
+(RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
+ | RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT
+ | RTL_TEXTTOUNICODE_FLAGS_FLUSH),
+, );
+CPPUNIT_ASSERT_EQUAL(sal_Size(1), size);
+CPPUNIT_ASSERT_EQUAL(
+OUString(u"\uFFFD"), OUString(buf, sal_Int32(size)));
+CPPUNIT_ASSERT_EQUAL(RTL_TEXTTOUNICODE_INFO_INVALID, info);
+CPPUNIT_ASSERT_EQUAL(sal_Size(2), converted);
+  

[Libreoffice-commits] core.git: sal/qa sal/textenc sax/inc scripting/source

2014-06-01 Thread Jens Carl
 sal/qa/inc/valueequal.hxx |5 +
 sal/textenc/convertisciidevangari.hxx |5 +
 sax/inc/xml2utf.hxx   |5 +
 scripting/source/dlgprov/DialogModelProvider.hxx  |6 ++
 scripting/source/provider/BrowseNodeFactoryImpl.hxx   |5 +
 scripting/source/provider/MasterScriptProviderFactory.hxx |6 ++
 6 files changed, 32 insertions(+)

New commits:
commit f3e1f476e9cffb75d0620ab2dcfdc1ea077cd9d3
Author: Jens Carl j.car...@gmx.de
Date:   Fri May 30 21:07:48 2014 +

fdo#68849: Add header guards to all include files

Added header guards to files in directories sal/, sax/, and scripting/

Change-Id: Ieb7f224f2d27bd671618c516f47f5b7f08c1d294
Reviewed-on: https://gerrit.libreoffice.org/9582
Reviewed-by: Thomas Arnhold tho...@arnhold.org
Tested-by: Thomas Arnhold tho...@arnhold.org

diff --git a/sal/qa/inc/valueequal.hxx b/sal/qa/inc/valueequal.hxx
index 563314d..0a11a2a 100644
--- a/sal/qa/inc/valueequal.hxx
+++ b/sal/qa/inc/valueequal.hxx
@@ -17,6 +17,9 @@
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
 
+#ifndef INCLUDED_SAL_QA_INC_VALUEEQUAL_HXX
+#define INCLUDED_SAL_QA_INC_VALUEEQUAL_HXX
+
 #include sal/config.h
 
 #include math.h
@@ -114,4 +117,6 @@ bool is_double_equal(double x, double y)
 return is_equaldouble(x, y, PREC_double);
 }
 
+#endif // INCLUDED_SAL_QA_INC_VALUEEQUAL_HXX
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sal/textenc/convertisciidevangari.hxx 
b/sal/textenc/convertisciidevangari.hxx
index b3f5f30..89aeeae 100644
--- a/sal/textenc/convertisciidevangari.hxx
+++ b/sal/textenc/convertisciidevangari.hxx
@@ -7,6 +7,9 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
+#ifndef INCLUDED_SAL_TEXTENC_CONVERTISCIIDEVANGARI_HXX
+#define INCLUDED_SAL_TEXTENC_CONVERTISCIIDEVANGARI_HXX
+
 #include sal/types.h
 
 sal_Size ImplConvertIsciiDevanagariToUnicode(void const * pData,
@@ -31,4 +34,6 @@ void ImplResetUnicodeToIsciiDevanagariContext(void * 
pContext);
 
 void ImplDestroyUnicodeToIsciiDevanagariContext(void * pContext);
 
+#endif // INCLUDED_SAL_TEXTENC_CONVERTISCIIDEVANGARI_HXX
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sax/inc/xml2utf.hxx b/sax/inc/xml2utf.hxx
index 0fa9fc7..526dd13 100644
--- a/sax/inc/xml2utf.hxx
+++ b/sax/inc/xml2utf.hxx
@@ -17,6 +17,9 @@
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
 
+#ifndef INCLUDED_SAX_INC_XML2UTF_HXX
+#define INCLUDED_SAX_INC_XML2UTF_HXX
+
 #include sal/types.h
 
 namespace sax_expatwrap {
@@ -131,4 +134,6 @@ private:
 };
 }
 
+#endif // INCLUDED_SAX_INC_XML2UTF_HXX
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/scripting/source/dlgprov/DialogModelProvider.hxx 
b/scripting/source/dlgprov/DialogModelProvider.hxx
index 87e7c4d..2a0696b 100644
--- a/scripting/source/dlgprov/DialogModelProvider.hxx
+++ b/scripting/source/dlgprov/DialogModelProvider.hxx
@@ -16,6 +16,10 @@
  *   except in compliance with the License. You may obtain a copy of
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
+
+#ifndef INCLUDED_SCRIPTING_SOURCE_DLGPROV_DIALOGMODELPROVIDER_HXX
+#define INCLUDED_SCRIPTING_SOURCE_DLGPROV_DIALOGMODELPROVIDER_HXX
+
 #include sal/config.h
 #include cppuhelper/factory.hxx
 #include cppuhelper/implbase4.hxx
@@ -81,4 +85,6 @@ private:
 };
 } // closing anonymous implementation namespace
 
+#endif // INCLUDED_SCRIPTING_SOURCE_DLGPROV_DIALOGMODELPROVIDER_HXX
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/scripting/source/provider/BrowseNodeFactoryImpl.hxx 
b/scripting/source/provider/BrowseNodeFactoryImpl.hxx
index 4ceeecb..35dd242 100644
--- a/scripting/source/provider/BrowseNodeFactoryImpl.hxx
+++ b/scripting/source/provider/BrowseNodeFactoryImpl.hxx
@@ -17,6 +17,9 @@
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
 
+#ifndef INCLUDED_SCRIPTING_SOURCE_PROVIDER_BROWSENODEFACTORYIMPL_HXX
+#define INCLUDED_SCRIPTING_SOURCE_PROVIDER_BROWSENODEFACTORYIMPL_HXX
+
 #include rtl/ustring.hxx
 #include cppuhelper/implbase2.hxx
 
@@ -77,4 +80,6 @@ public:
 
 } // namespace browsenodefactory
 
+#endif // INCLUDED_SCRIPTING_SOURCE_PROVIDER_BROWSENODEFACTORYIMPL_HXX
+
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/scripting/source/provider/MasterScriptProviderFactory.hxx 
b/scripting/source/provider/MasterScriptProviderFactory.hxx
index 462a830..a732a84 100644
--- a/scripting/source/provider/MasterScriptProviderFactory.hxx
+++ b/scripting/source/provider/MasterScriptProviderFactory.hxx
@@ -16,6 +16,10 @@
  *   except in compliance with the License. You may obtain a copy of
  *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
  */
+
+#ifndef INCLUDED_SCRIPTING_SOURCE_PROVIDER_MASTERSCRIPTPROVIDERFACTORY_HXX
+#define