sw/source/filter/md/swmd.cxx | 98 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 3 deletions(-)
New commits: commit d28d6d7ec71a119a71a975e35393f1857450b7b6 Author: Ujjawal Kumar <[email protected]> AuthorDate: Wed Feb 11 17:05:25 2026 +0530 Commit: Thorsten Behrens <[email protected]> CommitDate: Mon Feb 16 05:09:07 2026 +0100 Markdown: Handle different file encodings correctly Change-Id: I0b7d3927b105b4b77b8e750d3c368c8708d11180 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/199155 Reviewed-by: Thorsten Behrens <[email protected]> Tested-by: Jenkins diff --git a/sw/source/filter/md/swmd.cxx b/sw/source/filter/md/swmd.cxx index 58346f778538..71ca58dbc443 100644 --- a/sw/source/filter/md/swmd.cxx +++ b/sw/source/filter/md/swmd.cxx @@ -58,6 +58,9 @@ #include <fmturl.hxx> #include <formatcontentcontrol.hxx> #include <docsh.hxx> +#include <unicode/utypes.h> +#include <unicode/ucsdet.h> +#include <rtl/tencinfo.h> #include "swmd.hxx" @@ -748,7 +751,6 @@ SwMarkdownParser::SwMarkdownParser(SwDoc& rD, SwPaM& rCursor, SvStream& rIn, OUS m_nFilesize = m_rInput.TellEnd(); m_rInput.Seek(STREAM_SEEK_TO_BEGIN); m_rInput.ResetError(); - m_pArr.reset(new char[m_nFilesize + 1]); } void MarkdownReader::SetupFilterOptions(SwDoc& rDoc) @@ -840,13 +842,103 @@ ErrCodeMsg MarkdownReader::Read(SwDoc& rDoc, const OUString& rBaseURL, SwPaM& rP ErrCode SwMarkdownParser::CallParser() { + // use utf8 + rtl_TextEncoding eSrcEnc = RTL_TEXTENCODING_DONTKNOW; + m_rInput.StartReadingUnicodeText(eSrcEnc); + if (m_rInput.good()) + { + sal_uInt64 nPos = m_rInput.Tell(); //bom size + { + std::vector<char> buf(65535); // Arbitrarily chosen 64KiB buffer + const size_t nSize = m_rInput.ReadBytes(buf.data(), buf.size()); + if (nSize > 0) + { + UErrorCode uerr = U_ZERO_ERROR; + UCharsetDetector* ucd = ucsdet_open(&uerr); + ucsdet_setText(ucd, buf.data(), nSize, &uerr); + if (const UCharsetMatch* match = ucsdet_detect(ucd, &uerr)) + { + const char* pEncodingName = ucsdet_getName(match, &uerr); + + if (strcmp("UTF-16LE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + 
m_rInput.SetEndian(SvStreamEndian::LITTLE); + } + else if (strcmp("UTF-16BE", pEncodingName) == 0) + { + eSrcEnc = RTL_TEXTENCODING_UCS2; + m_rInput.SetEndian(SvStreamEndian::BIG); + } + else + { + eSrcEnc = rtl_getTextEncodingFromMimeCharset(pEncodingName); + } + } + ucsdet_close(ucd); + } + else + { + return ERRCODE_IO_INVALIDLENGTH; + } + } + + if (eSrcEnc == RTL_TEXTENCODING_DONTKNOW) + return ERRCODE_IO_INVALIDCHAR; + + m_rInput.Seek(nPos); + m_rInput.ResetError(); + m_nFilesize -= nPos; + + OUString sData; + OString sUtf8Data; + + if (eSrcEnc == RTL_TEXTENCODING_UCS2) + { + if (m_nFilesize & 1) + return ERRCODE_IO_INVALIDCHAR; + + tools::Long nChars = m_nFilesize / 2; + std::vector<sal_Unicode> aCharData(nChars); + + for (tools::Long n = 0; n < nChars; n++) + { + m_rInput.ReadUtf16(aCharData[n]); + } + + sData = OUString(aCharData.data(), nChars); + sUtf8Data = OUStringToOString(sData, RTL_TEXTENCODING_UTF8); + } + else + { + tools::Long nChars = m_nFilesize; + std::vector<char> aCharData(nChars); + m_rInput.ReadBytes(aCharData.data(), nChars); + sData = OUString(aCharData.data(), nChars, eSrcEnc); + sUtf8Data = OUStringToOString(sData, RTL_TEXTENCODING_UTF8); + } + + if (sUtf8Data.getLength()) + { + m_nFilesize = sUtf8Data.getLength(); + m_pArr.reset(new char[m_nFilesize]); + memcpy(m_pArr.get(), sUtf8Data.getStr(), m_nFilesize); + } + else + { + return ERRCODE_IO_INVALIDCHAR; + } + } + else + { + return ERRCODE_IO_INVALIDCHAR; + } + ::StartProgress(STR_STATSTR_W4WREAD, 0, m_nFilesize, m_xDoc->GetDocShell()); SwTextFormatColl* pColl = m_xDoc->getIDocumentStylePoolAccess().GetTextCollFromPool(SwPoolFormatId::COLL_TEXT); m_xDoc->SetTextFormatColl(*m_pPam, pColl); - m_rInput.ReadBytes(m_pArr.get(), m_nFilesize); - m_pArr[m_nFilesize] = '\0'; ErrCode nRet;
