Script 'mail_helper' called by obssrc Hello community, here is the log from the commit of package sdcv for openSUSE:Factory checked in at 2023-04-14 13:13:14 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/sdcv (Old) and /work/SRC/openSUSE:Factory/.sdcv.new.19717 (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "sdcv" Fri Apr 14 13:13:14 2023 rev:5 rq:1079263 version:0.5.4 Changes: -------- --- /work/SRC/openSUSE:Factory/sdcv/sdcv.changes 2021-05-10 15:41:20.993093314 +0200 +++ /work/SRC/openSUSE:Factory/.sdcv.new.19717/sdcv.changes 2023-04-14 13:13:28.867702139 +0200 @@ -1,0 +2,11 @@ +Sun Dec 4 12:10:41 UTC 2022 - Dirk Müller <[email protected]> + +- update to 0.5.4: + * Use binary search for synonyms + * Various improvements in work with synonyms + * Added --json (same as --json-output) to match man + * Show all matched results + * More robust parsing of ifo file + * Prevent crash if the file size does not match the expected one for .oft files + +------------------------------------------------------------------- Old: ---- sdcv-0.5.3.tar.gz New: ---- sdcv-0.5.4.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ sdcv.spec ++++++ --- /var/tmp/diff_new_pack.qdshJm/_old 2023-04-14 13:13:29.339704838 +0200 +++ /var/tmp/diff_new_pack.qdshJm/_new 2023-04-14 13:13:29.343704861 +0200 @@ -1,7 +1,7 @@ # # spec file for package sdcv # -# Copyright (c) 2021 SUSE LLC +# Copyright (c) 2022 SUSE LLC # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -17,7 +17,7 @@ Name: sdcv -Version: 0.5.3 +Version: 0.5.4 Release: 0 Summary: Console version of the Stardict program License: GPL-2.0-only ++++++ sdcv-0.5.3.tar.gz -> sdcv-0.5.4.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/.clang-format new/sdcv-0.5.4/.clang-format --- old/sdcv-0.5.3/.clang-format 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/.clang-format 2022-06-24 20:33:33.000000000 +0200 @@ -15,7 +15,7 @@ BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: true BinPackParameters: true -ColumnLimit: 0 +ColumnLimit: 120 
ConstructorInitializerAllOnOneLineOrOnePerLine: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/.github/workflows/main.yml new/sdcv-0.5.4/.github/workflows/main.yml --- old/sdcv-0.5.3/.github/workflows/main.yml 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/.github/workflows/main.yml 2022-06-24 20:33:33.000000000 +0200 @@ -25,7 +25,7 @@ - uses: actions/checkout@v2 with: submodules: 'recursive' - - uses: jwlawson/[email protected] + - uses: jwlawson/[email protected] with: cmake-version: '3.5.1' github-api-token: ${{ secrets.GITHUB_TOKEN }} diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/CMakeLists.txt new/sdcv-0.5.4/CMakeLists.txt --- old/sdcv-0.5.3/CMakeLists.txt 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/CMakeLists.txt 2022-06-24 20:33:33.000000000 +0200 @@ -3,6 +3,10 @@ cmake_minimum_required(VERSION 3.5 FATAL_ERROR) cmake_policy(VERSION 3.5) +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CXX_EXTENSIONS False) + include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/compiler.cmake") set(ZLIB_FIND_REQUIRED True) @@ -91,7 +95,7 @@ set(CPACK_PACKAGE_DESCRIPTION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/README.org") set(CPACK_PACKAGE_VERSION_MAJOR "0") set(CPACK_PACKAGE_VERSION_MINOR "5") -set(CPACK_PACKAGE_VERSION_PATCH "3") +set(CPACK_PACKAGE_VERSION_PATCH "4") set(sdcv_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}") @@ -143,5 +147,7 @@ add_sdcv_shell_test(t_utf8input) add_sdcv_shell_test(t_datadir) add_sdcv_shell_test(t_return_code) + add_sdcv_shell_test(t_multiple_results) + add_sdcv_shell_test(t_newlines_in_ifo) endif (BUILD_TESTS) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/cmake/compiler.cmake new/sdcv-0.5.4/cmake/compiler.cmake 
--- old/sdcv-0.5.3/cmake/compiler.cmake 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/cmake/compiler.cmake 2022-06-24 20:33:33.000000000 +0200 @@ -16,19 +16,6 @@ endif() endif() -if (MSVC AND (MSVC_VERSION LESS 1900)) - message(FATAL_ERROR "MSVC version ${MSVC_VERSION} have no full c++11 support") -elseif (MSVC) - add_definitions(-DNOMINMAX) -elseif (NOT MSVC) - check_cxx_compiler_flag("-std=c++11" CXX_SUPPORTS_CXX11) - if (CXX_SUPPORTS_CXX11) - append("-std=c++11" CMAKE_CXX_FLAGS) - else () - message(FATAL_ERROR "sdcv requires C++11 support but the '-std=c++11' flag isn't supported.") - endif() -endif () - if (SDCV_COMPILER_IS_GCC_COMPATIBLE) append("-Wall" "-Wextra" "-Wformat-security" "-Wcast-align" "-Werror=format" "-Wcast-qual" CMAKE_C_FLAGS) append("-Wall" "-pedantic" "-Wextra" "-Wformat-security" "-Wcast-align" "-Werror=format" "-Wcast-qual" CMAKE_CXX_FLAGS) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/src/libwrapper.cpp new/sdcv-0.5.4/src/libwrapper.cpp --- old/sdcv-0.5.3/src/libwrapper.cpp 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/src/libwrapper.cpp 2022-06-24 20:33:33.000000000 +0200 @@ -199,14 +199,18 @@ void Library::SimpleLookup(const std::string &str, TSearchResultList &res_list) { - glong ind; + std::set<glong> wordIdxs; res_list.reserve(ndicts()); - for (gint idict = 0; idict < ndicts(); ++idict) - if (SimpleLookupWord(str.c_str(), ind, idict)) - res_list.push_back( - TSearchResult(dict_name(idict), - poGetWord(ind, idict), - parse_data(poGetWordData(ind, idict), colorize_output_))); + for (gint idict = 0; idict < ndicts(); ++idict) { + wordIdxs.clear(); + if (SimpleLookupWord(str.c_str(), wordIdxs, idict)) + for (auto &wordIdx : wordIdxs) + res_list.push_back( + TSearchResult(dict_name(idict), + poGetWord(wordIdx, idict), + parse_data(poGetWordData(wordIdx, idict), + colorize_output_))); + } } void Library::LookupWithFuzzy(const std::string &str, TSearchResultList 
&res_list) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/src/mapfile.hpp new/sdcv-0.5.4/src/mapfile.hpp --- old/sdcv-0.5.3/src/mapfile.hpp 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/src/mapfile.hpp 2022-06-24 20:33:33.000000000 +0200 @@ -7,6 +7,7 @@ #ifdef HAVE_MMAP #include <fcntl.h> #include <sys/mman.h> +#include <sys/stat.h> #include <sys/types.h> #endif #ifdef _WIN32 @@ -40,20 +41,25 @@ size = file_size; #ifdef HAVE_MMAP if ((mmap_fd = ::open(file_name, O_RDONLY)) < 0) { - //g_print("Open file %s failed!\n",fullfilename); + // g_print("Open file %s failed!\n",fullfilename); return false; } + struct stat st; + if (fstat(mmap_fd, &st) == -1 || st.st_size < 0 || (st.st_size == 0 && S_ISREG(st.st_mode)) + || sizeof(st.st_size) > sizeof(file_size) || static_cast<unsigned long>(st.st_size) != file_size) { + close(mmap_fd); + return false; + } + data = (gchar *)mmap(nullptr, file_size, PROT_READ, MAP_SHARED, mmap_fd, 0); if ((void *)data == (void *)(-1)) { - //g_print("mmap file %s failed!\n",idxfilename); + // g_print("mmap file %s failed!\n",idxfilename); data = nullptr; return false; } #elif defined(_WIN32) - hFile = CreateFile(file_name, GENERIC_READ, 0, nullptr, OPEN_ALWAYS, - FILE_ATTRIBUTE_NORMAL, 0); - hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, - file_size, nullptr); + hFile = CreateFile(file_name, GENERIC_READ, 0, nullptr, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); + hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, file_size, nullptr); data = (gchar *)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, file_size); #else gsize read_len; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/src/sdcv.cpp new/sdcv-0.5.4/src/sdcv.cpp --- old/sdcv-0.5.3/src/sdcv.cpp 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/src/sdcv.cpp 2022-06-24 20:33:33.000000000 +0200 @@ -83,6 +83,7 @@ glib::CharStr opt_data_dir; gboolean 
only_data_dir = FALSE; gboolean colorize = FALSE; + glib::StrArr word_list; const GOptionEntry entries[] = { { "version", 'v', 0, G_OPTION_ARG_NONE, &show_version, @@ -96,6 +97,8 @@ _("for use in scripts"), nullptr }, { "json-output", 'j', 0, G_OPTION_ARG_NONE, &json_output, _("print the result formatted as JSON"), nullptr }, + { "json", 'j', 0, G_OPTION_ARG_NONE, &json_output, + _("print the result formatted as JSON"), nullptr }, { "exact-search", 'e', 0, G_OPTION_ARG_NONE, &no_fuzzy, _("do not fuzzy-search for similar words, only return exact matches"), nullptr }, { "utf8-output", '0', 0, G_OPTION_ARG_NONE, &utf8_output, @@ -109,11 +112,13 @@ _("only use the dictionaries in data-dir, do not search in user and system directories"), nullptr }, { "color", 'c', 0, G_OPTION_ARG_NONE, &colorize, _("colorize the output"), nullptr }, + { G_OPTION_REMAINING, 0, 0, G_OPTION_ARG_FILENAME_ARRAY, get_addr(word_list), + _("search terms"), _(" words") }, {}, }; glib::Error error; - GOptionContext *context = g_option_context_new(_(" words")); + GOptionContext *context = g_option_context_new(nullptr); g_option_context_set_help_enabled(context, TRUE); g_option_context_add_main_entries(context, entries, nullptr); const gboolean parse_res = g_option_context_parse(context, &argc, &argv, get_addr(error)); @@ -210,14 +215,19 @@ lib.load(dicts_dir_list, order_list, disable_list); std::unique_ptr<IReadLine> io(create_readline_object()); - if (optind < argc) { + if (word_list != nullptr) { search_result rval = SEARCH_SUCCESS; - for (int i = optind; i < argc; ++i) - if ((rval = lib.process_phrase(argv[i], *io, non_interactive)) != SEARCH_SUCCESS) { - return rval; - } + gchar **p = get_impl(word_list); + while (*p) { + search_result this_rval = lib.process_phrase(*p++, *io, non_interactive); + // If we encounter any error, save it but continue through the word + // list to check all requested words. 
+ if (rval == SEARCH_SUCCESS) + rval = this_rval; + } + if (rval != SEARCH_SUCCESS) + return rval; } else if (!non_interactive) { - std::string phrase; while (io->read(_("Enter word or phrase: "), phrase)) { if (lib.process_phrase(phrase.c_str(), *io) == SEARCH_FAILURE) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/src/stardict_lib.cpp new/sdcv-0.5.4/src/stardict_lib.cpp --- old/sdcv-0.5.3/src/stardict_lib.cpp 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/src/stardict_lib.cpp 2022-06-24 20:33:33.000000000 +0200 @@ -5,6 +5,7 @@ #include <algorithm> #include <cctype> #include <cstring> +#include <map> #include <stdexcept> #include <glib/gstdio.h> @@ -47,9 +48,9 @@ { // i think this should work even when it is UTF8 string :). for (int i = 0; str[i] != 0; i++) - //if(str[i]<0) - //if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK. - // Better use isascii() but not str[i]<0 while char is default unsigned in arm + // if(str[i]<0) + // if(str[i]<32 || str[i]>126) // tab equal 9,so this is not OK. + // Better use isascii() but not str[i]<0 while char is default unsigned in arm if (!isascii(str[i])) return false; return true; @@ -78,108 +79,93 @@ { ifo_file_name = ifofilename; glib::CharStr buffer; - if (!g_file_get_contents(ifofilename.c_str(), get_addr(buffer), nullptr, nullptr)) + gsize length = 0; + if (!g_file_get_contents(ifofilename.c_str(), get_addr(buffer), &length, nullptr)) { + fprintf(stderr, "Can not read from %s\n", ifofilename.c_str()); return false; + } static const char TREEDICT_MAGIC_DATA[] = "StarDict's treedict ifo file"; static const char DICT_MAGIC_DATA[] = "StarDict's dict ifo file"; const gchar *magic_data = istreedict ? TREEDICT_MAGIC_DATA : DICT_MAGIC_DATA; - static const unsigned char utf8_bom[] = { 0xEF, 0xBB, 0xBF, '\0' }; - if (!g_str_has_prefix( - g_str_has_prefix(get_impl(buffer), (const gchar *)(utf8_bom)) ? 
get_impl(buffer) + 3 : get_impl(buffer), - magic_data)) { - return false; - } + static const gchar utf8_bom[] = { (gchar)0xEF, (gchar)0xBB, (gchar)0xBF, '\0' }; - gchar *p1 = get_impl(buffer) + strlen(magic_data) - 1; - - gchar *p2 = strstr(p1, "\nwordcount="); - if (p2 == nullptr) - return false; + const gchar *p = get_impl(buffer); + const gchar *end = p + length; - gchar *p3 = strchr(p2 + sizeof("\nwordcount=") - 1, '\n'); - - wordcount = atol(std::string(p2 + sizeof("\nwordcount=") - 1, p3 - (p2 + sizeof("\nwordcount=") - 1)).c_str()); - - if (istreedict) { - p2 = strstr(p1, "\ntdxfilesize="); - if (p2 == nullptr) - return false; - - p3 = strchr(p2 + sizeof("\ntdxfilesize=") - 1, '\n'); - - index_file_size = atol(std::string(p2 + sizeof("\ntdxfilesize=") - 1, p3 - (p2 + sizeof("\ntdxfilesize=") - 1)).c_str()); - - } else { - - p2 = strstr(p1, "\nidxfilesize="); - if (p2 == nullptr) - return false; - - p3 = strchr(p2 + sizeof("\nidxfilesize=") - 1, '\n'); - index_file_size = atol(std::string(p2 + sizeof("\nidxfilesize=") - 1, p3 - (p2 + sizeof("\nidxfilesize=") - 1)).c_str()); + if (g_str_has_prefix(p, utf8_bom)) { + p += strlen(utf8_bom); } - - p2 = strstr(p1, "\nbookname="); - - if (p2 == nullptr) + if (!g_str_has_prefix(p, magic_data)) { + fprintf(stderr, "No magic header(%s) in ifo file\n", magic_data); return false; - - p2 = p2 + sizeof("\nbookname=") - 1; - p3 = strchr(p2, '\n'); - bookname.assign(p2, p3 - p2); - - p2 = strstr(p1, "\nauthor="); - if (p2) { - p2 = p2 + sizeof("\nauthor=") - 1; - p3 = strchr(p2, '\n'); - author.assign(p2, p3 - p2); } + p += strlen(magic_data); - p2 = strstr(p1, "\nemail="); - if (p2) { - p2 = p2 + sizeof("\nemail=") - 1; - p3 = strchr(p2, '\n'); - email.assign(p2, p3 - p2); - } + std::map<std::string, std::string> key_value_map; + while (p != end) { + auto key_it = std::find_if(p, end, [](gchar ch) { return !g_ascii_isspace(ch); }); + if (key_it == end) { + break; + } + auto eq_it = std::find(key_it, end, gchar('=')); + if 
(eq_it == end) { + fprintf(stderr, "Invalid part of ifo (no '=') here: %s\n", key_it); + return false; + } + auto val_it = std::find_if(eq_it + 1, end, [](gchar ch) { return !g_ascii_isspace(ch); }); + if (val_it == end) { + key_value_map.insert(std::make_pair(std::string(key_it, eq_it), std::string())); + break; + } - p2 = strstr(p1, "\nwebsite="); - if (p2) { - p2 = p2 + sizeof("\nwebsite=") - 1; - p3 = strchr(p2, '\n'); - website.assign(p2, p3 - p2); + auto line_end_it = std::find_if(val_it, end, [](gchar ch) { return ch == '\r' || ch == '\n'; }); + key_value_map.insert(std::make_pair(std::string(key_it, eq_it), std::string(val_it, line_end_it))); + if (line_end_it == end) + break; + p = line_end_it + 1; } - p2 = strstr(p1, "\ndate="); - if (p2) { - p2 = p2 + sizeof("\ndate=") - 1; - p3 = strchr(p2, '\n'); - date.assign(p2, p3 - p2); + std::map<std::string, std::string>::const_iterator it; +#define FIND_KEY(_key_) \ + it = key_value_map.find(_key_); \ + if (it == key_value_map.end()) { \ + fprintf(stderr, "Can not find '%s' in ifo file\n", _key_); \ + return false; \ } - p2 = strstr(p1, "\ndescription="); - if (p2) { - p2 = p2 + sizeof("\ndescription=") - 1; - p3 = strchr(p2, '\n'); - description.assign(p2, p3 - p2); - } + FIND_KEY("wordcount") + wordcount = atol(it->second.c_str()); - p2 = strstr(p1, "\nsametypesequence="); - if (p2) { - p2 += sizeof("\nsametypesequence=") - 1; - p3 = strchr(p2, '\n'); - sametypesequence.assign(p2, p3 - p2); + if (istreedict) { + FIND_KEY("tdxfilesize") + index_file_size = atol(it->second.c_str()); + } else { + FIND_KEY("idxfilesize") + index_file_size = atol(it->second.c_str()); } + FIND_KEY("bookname") + bookname = it->second; - p2 = strstr(p1, "\nsynwordcount="); +#define SET_IF_EXISTS(_key_) \ + it = key_value_map.find(#_key_); \ + if (it != key_value_map.end()) { \ + _key_ = it->second; \ + } + + SET_IF_EXISTS(author) + SET_IF_EXISTS(email) + SET_IF_EXISTS(website) + SET_IF_EXISTS(date) + SET_IF_EXISTS(description) + 
SET_IF_EXISTS(sametypesequence) syn_wordcount = 0; - if (p2) { - p2 += sizeof("\nsynwordcount=") - 1; - p3 = strchr(p2, '\n'); - syn_wordcount = atol(std::string(p2, p3 - p2).c_str()); - } - + it = key_value_map.find("synwordcount"); + if (it != key_value_map.end()) + syn_wordcount = atol(it->second.c_str()); +#undef FIND_KEY +#undef SET_IF_EXISTS return true; } @@ -204,10 +190,10 @@ guint32 data_size; gint sametypesequence_len = sametypesequence.length(); - //there have sametypesequence_len char being omitted. + // there have sametypesequence_len char being omitted. data_size = idxitem_size + sizeof(guint32) + sametypesequence_len; - //if the last item's size is determined by the end up '\0',then +=sizeof(gchar); - //if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32); + // if the last item's size is determined by the end up '\0',then +=sizeof(gchar); + // if the last item's size is determined by the head guint32 type data,then +=sizeof(guint32); switch (sametypesequence[sametypesequence_len - 1]) { case 'm': case 't': @@ -234,7 +220,7 @@ p1 = data + sizeof(guint32); p2 = get_impl(origin_data); guint32 sec_size; - //copy the head items. + // copy the head items. for (int i = 0; i < sametypesequence_len - 1; i++) { *p1 = sametypesequence[i]; p1 += sizeof(gchar); @@ -272,7 +258,7 @@ break; } } - //calculate the last item 's size. + // calculate the last item 's size. 
sec_size = idxitem_size - (p2 - get_impl(origin_data)); *p1 = sametypesequence[sametypesequence_len - 1]; p1 += sizeof(gchar); @@ -286,7 +272,7 @@ case 'k': memcpy(p1, p2, sec_size); p1 += sec_size; - *p1 = '\0'; //add the end up '\0'; + *p1 = '\0'; // add the end up '\0'; break; case 'W': case 'P': @@ -450,7 +436,7 @@ { return get_key(idx); } - bool lookup(const char *str, glong &idx) override; + bool lookup(const char *str, std::set<glong> &idxs, glong &next_idx) override; private: static const gint ENTR_PER_PAGE = 32; @@ -511,7 +497,7 @@ get_data(idx); return get_key(idx); } - bool lookup(const char *str, glong &idx) override; + bool lookup(const char *str, std::set<glong> &idxs, glong &next_idx) override; private: gchar *idxdatabuf; @@ -542,7 +528,7 @@ std::min(sizeof(wordentry_buf), static_cast<size_t>(page_size)), 1, idxfile); THROW_IF_ERROR(nitems == 1); - //TODO: check returned values, deal with word entry that strlen>255. + // TODO: check returned values, deal with word entry that strlen>255. 
return wordentry_buf; } @@ -634,7 +620,7 @@ wordcount = wc; gulong npages = (wc - 1) / ENTR_PER_PAGE + 2; wordoffset.resize(npages); - if (!load_cache(url)) { //map file will close after finish of block + if (!load_cache(url)) { // map file will close after finish of block MapFile map_file; if (!map_file.open(url.c_str(), fsize)) return false; @@ -698,47 +684,52 @@ return page.entries[idx_in_page].keystr; } -bool OffsetIndex::lookup(const char *str, glong &idx) +bool OffsetIndex::lookup(const char *str, std::set<glong> &idxs, glong &next_idx) { bool bFound = false; - glong iFrom; - glong iTo = wordoffset.size() - 2; - gint cmpint; - glong iThisIndex; + if (stardict_strcmp(str, first.keystr.c_str()) < 0) { - idx = 0; + next_idx = 0; return false; } else if (stardict_strcmp(str, real_last.keystr.c_str()) > 0) { - idx = INVALID_INDEX; + next_idx = INVALID_INDEX; return false; - } else { - iFrom = 0; - iThisIndex = 0; - while (iFrom <= iTo) { - iThisIndex = (iFrom + iTo) / 2; - cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex)); - if (cmpint > 0) - iFrom = iThisIndex + 1; - else if (cmpint < 0) - iTo = iThisIndex - 1; - else { - bFound = true; - break; - } + } + + // Search for the first page where the word is likely to be located. + glong iFrom = 0, iTo = wordoffset.size() - 2; + glong iPage = 0, iThisIndex = 0; + while (iFrom <= iTo) { + iThisIndex = (iFrom + iTo) / 2; + glong cmpint = stardict_strcmp(str, get_first_on_page_key(iThisIndex)); + if (cmpint > 0) + iFrom = iThisIndex + 1; + else if (cmpint < 0) + iTo = iThisIndex - 1; + else { + bFound = true; + break; } - if (!bFound) - idx = iTo; //prev - else - idx = iThisIndex; } - if (!bFound) { - gulong netr = load_page(idx); - iFrom = 1; // Needn't search the first word anymore. + + if (bFound) { + // We can use this found index (even though it might not be the first) + // because we will search backwards later and catch any entries on + // previous pages. 
+ iPage = iThisIndex; + iThisIndex = 0; // first item in the page + } else { + iPage = iTo; // prev + // Not found at the start of a page, so search within the page that + // should contain it. Binary search here is slightly overkill (we're + // searching at most ENTR_PER_PAGE = 32 elements) but this way next_idx + // is treated the same as other Lookup methods. + gulong netr = load_page(iPage); + iFrom = 0; iTo = netr - 1; - iThisIndex = 0; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; - cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr); + glong cmpint = stardict_strcmp(str, page.entries[iThisIndex].keystr); if (cmpint > 0) iFrom = iThisIndex + 1; else if (cmpint < 0) @@ -748,13 +739,21 @@ break; } } - idx *= ENTR_PER_PAGE; - if (!bFound) - idx += iFrom; //next - else - idx += iThisIndex; - } else { - idx *= ENTR_PER_PAGE; + } + + if (!bFound) + next_idx = iPage * ENTR_PER_PAGE + iFrom; // next + else { + // Convert the found in-page index to the dict index. + iThisIndex = iPage * ENTR_PER_PAGE + iThisIndex; + // In order to return all idxs that match the search string, walk + // linearly behind and ahead of the found index. 
+ glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex + while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) + idxs.insert(iHeadIndex--); + do // no need to double-check iThisIndex -- we know it's a match already + idxs.insert(iThisIndex++); + while (iThisIndex <= real_last.idx && stardict_strcmp(str, get_key(iThisIndex)) == 0); } return bFound; } @@ -795,18 +794,18 @@ wordentry_size = g_ntohl(get_uint32(p1)); } -bool WordListIndex::lookup(const char *str, glong &idx) +bool WordListIndex::lookup(const char *str, std::set<glong> &idxs, glong &next_idx) { bool bFound = false; - glong iTo = wordlist.size() - 2; + glong iLast = wordlist.size() - 2; if (stardict_strcmp(str, get_key(0)) < 0) { - idx = 0; - } else if (stardict_strcmp(str, get_key(iTo)) > 0) { - idx = INVALID_INDEX; + next_idx = 0; + } else if (stardict_strcmp(str, get_key(iLast)) > 0) { + next_idx = INVALID_INDEX; } else { glong iThisIndex = 0; - glong iFrom = 0; + glong iFrom = 0, iTo = iLast; gint cmpint; while (iFrom <= iTo) { iThisIndex = (iFrom + iTo) / 2; @@ -821,9 +820,17 @@ } } if (!bFound) - idx = iFrom; //next - else - idx = iThisIndex; + next_idx = iFrom; // next + else { + // In order to return all idxs that match the search string, walk + // linearly behind and ahead of the found index. 
+ glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex + while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) + idxs.insert(iHeadIndex--); + do // no need to double-check iThisIndex -- we know it's a match already + idxs.insert(iThisIndex++); + while (iThisIndex <= iLast && stardict_strcmp(str, get_key(iThisIndex)) == 0); + } } return bFound; } @@ -833,41 +840,82 @@ { struct stat stat_buf; if (!stat(url.c_str(), &stat_buf)) { - MapFile syn; - if (!syn.open(url.c_str(), stat_buf.st_size)) + + if (!synfile.open(url.c_str(), stat_buf.st_size)) return false; - const gchar *current = syn.begin(); + + synlist.resize(wc + 1); + gchar *p1 = synfile.begin(); + for (unsigned long i = 0; i < wc; i++) { // each entry in a syn-file is: // - 0-terminated string // 4-byte index into .dict file in network byte order - glib::CharStr lower_string{ g_utf8_casefold(current, -1) }; - std::string synonym{ get_impl(lower_string) }; - current += synonym.length() + 1; - const guint32 idx = g_ntohl(get_uint32(current)); - current += sizeof(idx); - synonyms[synonym] = idx; + + synlist[i] = p1; + p1 += strlen(p1) + 1 + 4; } + synlist[wc] = p1; + return true; } else { return false; } } -bool SynFile::lookup(const char *str, glong &idx) +bool SynFile::lookup(const char *str, std::set<glong> &idxs, glong &next_idx) { - glib::CharStr lower_string{ g_utf8_casefold(str, -1) }; - auto it = synonyms.find(get_impl(lower_string)); - if (it != synonyms.end()) { - idx = it->second; - return true; + bool bFound = false; + glong iLast = synlist.size() - 2; + if (iLast < 0) + return false; + + if (stardict_strcmp(str, get_key(0)) < 0) { + next_idx = 0; + } else if (stardict_strcmp(str, get_key(iLast)) > 0) { + next_idx = INVALID_INDEX; + } else { + glong iThisIndex = 0; + glong iFrom = 0, iTo = iLast; + gint cmpint; + while (iFrom <= iTo) { + iThisIndex = (iFrom + iTo) / 2; + cmpint = stardict_strcmp(str, get_key(iThisIndex)); + if (cmpint > 0) + iFrom = iThisIndex + 1; + 
else if (cmpint < 0) + iTo = iThisIndex - 1; + else { + bFound = true; + break; + } + } + if (!bFound) + next_idx = iFrom; // next + else { + // In order to return all idxs that match the search string, walk + // linearly behind and ahead of the found index. + glong iHeadIndex = iThisIndex - 1; // do not include iThisIndex + while (iHeadIndex >= 0 && stardict_strcmp(str, get_key(iHeadIndex)) == 0) { + const gchar *key = get_key(iHeadIndex--); + idxs.insert(g_ntohl(get_uint32(key + strlen(key) + 1))); + } + do { + // no need to double-check iThisIndex -- we know it's a match already + const gchar *key = get_key(iThisIndex++); + idxs.insert(g_ntohl(get_uint32(key + strlen(key) + 1))); + } while (iThisIndex <= iLast && stardict_strcmp(str, get_key(iThisIndex)) == 0); + } } - return false; + return bFound; } -bool Dict::Lookup(const char *str, glong &idx) +bool Dict::Lookup(const char *str, std::set<glong> &idxs, glong &next_idx) { - return syn_file->lookup(str, idx) || idx_file->lookup(str, idx); + bool found = false; + found |= syn_file->lookup(str, idxs, next_idx); + found |= idx_file->lookup(str, idxs, next_idx); + return found; } bool Dict::load(const std::string &ifofilename, bool verbose) @@ -882,14 +930,14 @@ if (g_file_test(fullfilename.c_str(), G_FILE_TEST_EXISTS)) { dictdzfile.reset(new DictData); if (!dictdzfile->open(fullfilename, 0)) { - //g_print("open file %s failed!\n",fullfilename); + // g_print("open file %s failed!\n",fullfilename); return false; } } else { fullfilename.erase(fullfilename.length() - sizeof(".dz") + 1, sizeof(".dz") - 1); dictfile = fopen(fullfilename.c_str(), "rb"); if (!dictfile) { - //g_print("open file %s failed!\n",fullfilename); + // g_print("open file %s failed!\n",fullfilename); return false; } } @@ -912,7 +960,7 @@ syn_file.reset(new SynFile); syn_file->load(fullfilename, syn_wordcount); - //g_print("bookname: %s , wordcount %lu\n", bookname.c_str(), narticles()); + // g_print("bookname: %s , wordcount %lu\n", 
bookname.c_str(), narticles()); return true; } @@ -975,120 +1023,8 @@ }); } -const gchar *Libs::poGetCurrentWord(glong *iCurrent) +bool Libs::LookupSimilarWord(const gchar *sWord, std::set<glong> &iWordIndices, int iLib) { - const gchar *poCurrentWord = nullptr; - const gchar *word; - for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) { - if (iCurrent[iLib] == INVALID_INDEX) - continue; - if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0) - continue; - if (poCurrentWord == nullptr) { - poCurrentWord = poGetWord(iCurrent[iLib], iLib); - } else { - word = poGetWord(iCurrent[iLib], iLib); - - if (stardict_strcmp(poCurrentWord, word) > 0) - poCurrentWord = word; - } - } - return poCurrentWord; -} - -const gchar *Libs::poGetNextWord(const gchar *sWord, glong *iCurrent) -{ - // the input can be: - // (word,iCurrent),read word,write iNext to iCurrent,and return next word. used by TopWin::NextCallback(); - // (nullptr,iCurrent),read iCurrent,write iNext to iCurrent,and return next word. 
used by AppCore::ListWords(); - const gchar *poCurrentWord = nullptr; - size_t iCurrentLib = 0; - const gchar *word; - - for (size_t iLib = 0; iLib < oLib.size(); ++iLib) { - if (sWord) - oLib[iLib]->Lookup(sWord, iCurrent[iLib]); - if (iCurrent[iLib] == INVALID_INDEX) - continue; - if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0) - continue; - if (poCurrentWord == nullptr) { - poCurrentWord = poGetWord(iCurrent[iLib], iLib); - iCurrentLib = iLib; - } else { - word = poGetWord(iCurrent[iLib], iLib); - - if (stardict_strcmp(poCurrentWord, word) > 0) { - poCurrentWord = word; - iCurrentLib = iLib; - } - } - } - if (poCurrentWord) { - iCurrent[iCurrentLib]++; - for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) { - if (iLib == iCurrentLib) - continue; - if (iCurrent[iLib] == INVALID_INDEX) - continue; - if (iCurrent[iLib] >= narticles(iLib) || iCurrent[iLib] < 0) - continue; - if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib], iLib)) == 0) - iCurrent[iLib]++; - } - poCurrentWord = poGetCurrentWord(iCurrent); - } - return poCurrentWord; -} - -const gchar * -Libs::poGetPreWord(glong *iCurrent) -{ - // used by TopWin::PreviousCallback(); the iCurrent is cached by AppCore::TopWinWordChange(); - const gchar *poCurrentWord = nullptr; - std::vector<Dict *>::size_type iCurrentLib = 0; - const gchar *word; - - for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) { - if (iCurrent[iLib] == INVALID_INDEX) - iCurrent[iLib] = narticles(iLib); - else { - if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0) - continue; - } - if (poCurrentWord == nullptr) { - poCurrentWord = poGetWord(iCurrent[iLib] - 1, iLib); - iCurrentLib = iLib; - } else { - word = poGetWord(iCurrent[iLib] - 1, iLib); - if (stardict_strcmp(poCurrentWord, word) < 0) { - poCurrentWord = word; - iCurrentLib = iLib; - } - } - } - - if (poCurrentWord) { - iCurrent[iCurrentLib]--; - for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) 
{ - if (iLib == iCurrentLib) - continue; - if (iCurrent[iLib] > narticles(iLib) || iCurrent[iLib] <= 0) - continue; - if (strcmp(poCurrentWord, poGetWord(iCurrent[iLib] - 1, iLib)) == 0) { - iCurrent[iLib]--; - } else { - if (iCurrent[iLib] == narticles(iLib)) - iCurrent[iLib] = INVALID_INDEX; - } - } - } - return poCurrentWord; -} - -bool Libs::LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib) -{ - glong iIndex; bool bFound = false; gchar *casestr; @@ -1096,7 +1032,7 @@ // to lower case. casestr = g_utf8_strdown(sWord, -1); if (strcmp(casestr, sWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1104,7 +1040,7 @@ if (!bFound) { casestr = g_utf8_strup(sWord, -1); if (strcmp(casestr, sWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1118,7 +1054,7 @@ g_free(firstchar); g_free(nextchar); if (strcmp(casestr, sWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1132,18 +1068,18 @@ gchar *sNewWord = (gchar *)g_malloc(iWordLen + 1); - //cut one char "s" or "d" + // cut one char "s" or "d" if (!bFound && iWordLen > 1) { isupcase = sWord[iWordLen - 1] == 'S' || !strncmp(&sWord[iWordLen - 2], "ED", 2); if (isupcase || sWord[iWordLen - 1] == 's' || !strncmp(&sWord[iWordLen - 2], "ed", 2)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 1] = '\0'; // cut "s" or "d" - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1151,38 +1087,38 @@ } } - //cut "ly" + // cut "ly" if (!bFound && iWordLen > 2) { 
isupcase = !strncmp(&sWord[iWordLen - 2], "LY", 2); if (isupcase || (!strncmp(&sWord[iWordLen - 2], "ly", 2))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; // cut "ly" if (iWordLen > 5 && sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4] - && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled + && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { // doubled sNewWord[iWordLen - 3] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) - sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore + sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; // restore } } if (!bFound) { - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1191,37 +1127,37 @@ } } - //cut "ing" + // cut "ing" if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 3], "ING", 3); if (isupcase || !strncmp(&sWord[iWordLen - 3], "ing", 3)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; if (iWordLen > 6 && (sNewWord[iWordLen - 4] == sNewWord[iWordLen - 5]) - && !bIsVowel(sNewWord[iWordLen - 5]) && bIsVowel(sNewWord[iWordLen - 6])) { //doubled + && !bIsVowel(sNewWord[iWordLen - 5]) && bIsVowel(sNewWord[iWordLen - 6])) { // doubled sNewWord[iWordLen - 4] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase 
|| g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) - sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; //restore + sNewWord[iWordLen - 4] = sNewWord[iWordLen - 5]; // restore } } if (!bFound) { - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1232,12 +1168,12 @@ strcat(sNewWord, "E"); // add a char "E" else strcat(sNewWord, "e"); // add a char "e" - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1246,18 +1182,18 @@ } } - //cut two char "es" + // cut two char "es" if (!bFound && iWordLen > 3) { isupcase = (!strncmp(&sWord[iWordLen - 2], "ES", 2) && (sWord[iWordLen - 3] == 'S' || sWord[iWordLen - 3] == 'X' || sWord[iWordLen - 3] == 'O' || (iWordLen > 4 && sWord[iWordLen - 3] == 'H' && (sWord[iWordLen - 4] == 'C' || sWord[iWordLen - 4] == 'S')))); if (isupcase || (!strncmp(&sWord[iWordLen - 2], "es", 2) && (sWord[iWordLen - 3] == 's' || sWord[iWordLen - 3] == 'x' || sWord[iWordLen - 3] == 'o' || (iWordLen > 4 && sWord[iWordLen - 3] == 'h' && (sWord[iWordLen - 4] == 'c' || sWord[iWordLen - 4] == 's'))))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = 
true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1265,37 +1201,37 @@ } } - //cut "ed" + // cut "ed" if (!bFound && iWordLen > 3) { isupcase = !strncmp(&sWord[iWordLen - 2], "ED", 2); if (isupcase || !strncmp(&sWord[iWordLen - 2], "ed", 2)) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; if (iWordLen > 5 && (sNewWord[iWordLen - 3] == sNewWord[iWordLen - 4]) - && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { //doubled + && !bIsVowel(sNewWord[iWordLen - 4]) && bIsVowel(sNewWord[iWordLen - 5])) { // doubled sNewWord[iWordLen - 3] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else { if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); } if (!bFound) - sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; //restore + sNewWord[iWordLen - 3] = sNewWord[iWordLen - 4]; // restore } } if (!bFound) { - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1314,12 +1250,12 @@ strcat(sNewWord, "Y"); // add a char "Y" else strcat(sNewWord, "y"); // add a char "y" - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, 
sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1337,12 +1273,12 @@ strcat(sNewWord, "Y"); // add a char "Y" else strcat(sNewWord, "y"); // add a char "y" - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1356,12 +1292,12 @@ if (isupcase || (!strncmp(&sWord[iWordLen - 2], "er", 2))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 2] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1375,12 +1311,12 @@ if (isupcase || (!strncmp(&sWord[iWordLen - 3], "est", 3))) { strcpy(sNewWord, sWord); sNewWord[iWordLen - 3] = '\0'; - if (oLib[iLib]->Lookup(sNewWord, iIndex)) + if (oLib[iLib]->Lookup(sNewWord, iWordIndices)) bFound = true; else if (isupcase || g_ascii_isupper(sWord[0])) { casestr = g_ascii_strdown(sNewWord, -1); if (strcmp(casestr, sNewWord)) { - if (oLib[iLib]->Lookup(casestr, iIndex)) + if (oLib[iLib]->Lookup(casestr, iWordIndices)) bFound = true; } g_free(casestr); @@ -1390,24 +1326,21 @@ g_free(sNewWord); } - - if (bFound) - iWordIndex = iIndex; #if 0 - else { - //don't change iWordIndex here. - //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words. - //iWordIndex = INVALID_INDEX; - } + else { + //don't change iWordIndex here. 
+ //when LookupSimilarWord all failed too, we want to use the old LookupWord index to list words. + //iWordIndex = INVALID_INDEX; + } #endif return bFound; } -bool Libs::SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib) +bool Libs::SimpleLookupWord(const gchar *sWord, std::set<glong> &iWordIndices, int iLib) { - bool bFound = oLib[iLib]->Lookup(sWord, iWordIndex); + bool bFound = oLib[iLib]->Lookup(sWord, iWordIndices); if (!bFound && fuzzy_) - bFound = LookupSimilarWord(sWord, iWordIndex, iLib); + bFound = LookupSimilarWord(sWord, iWordIndices, iLib); return bFound; } @@ -1439,8 +1372,8 @@ if (progress_func) progress_func(); - //if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) { - //there are Chinese dicts and English dicts... + // if (stardict_strcmp(sWord, poGetWord(0,iLib))>=0 && stardict_strcmp(sWord, poGetWord(narticles(iLib)-1,iLib))<=0) { + // there are Chinese dicts and English dicts... const int iwords = narticles(iLib); for (int index = 0; index < iwords; index++) { @@ -1462,11 +1395,11 @@ bool bAlreadyInList = false; int iMaxDistanceAt = 0; for (int j = 0; j < reslist_size; j++) { - if (oFuzzystruct[j].pMatchWord && strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0) { //already in list + if (oFuzzystruct[j].pMatchWord && strcmp(oFuzzystruct[j].pMatchWord, sCheck) == 0) { // already in list bAlreadyInList = true; break; } - //find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time. + // find the position,it will certainly be found (include the first time) as iMaxDistance is set by last time. 
if (oFuzzystruct[j].iMatchWordDistance == iMaxDistance) { iMaxDistanceAt = j; } @@ -1513,8 +1446,8 @@ GPatternSpec *pspec = g_pattern_spec_new(word); for (std::vector<Dict *>::size_type iLib = 0; iLib < oLib.size(); iLib++) { - //if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib)) - // -iMatchCount,so save time,but may got less result and the word may repeat. + // if(oLibs.LookdupWordsWithRule(pspec,aiIndex,MAX_MATCH_ITEM_PER_LIB+1-iMatchCount,iLib)) + // -iMatchCount,so save time,but may got less result and the word may repeat. if (oLib[iLib]->LookupWithRule(pspec, aiIndex, MAX_MATCH_ITEM_PER_LIB + 1)) { if (progress_func) @@ -1523,7 +1456,7 @@ const gchar *sMatchWord = poGetWord(aiIndex[i], iLib); bool bAlreadyInList = false; for (int j = 0; j < iMatchCount; j++) { - if (strcmp(ppMatchWord[j], sMatchWord) == 0) { //already in list + if (strcmp(ppMatchWord[j], sMatchWord) == 0) { // already in list bAlreadyInList = true; break; } diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/src/stardict_lib.hpp new/sdcv-0.5.4/src/stardict_lib.hpp --- old/sdcv-0.5.3/src/stardict_lib.hpp 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/src/stardict_lib.hpp 2022-06-24 20:33:33.000000000 +0200 @@ -1,11 +1,10 @@ #pragma once -#include <cstdio> #include <cstring> #include <functional> #include <list> -#include <map> #include <memory> +#include <set> #include <string> #include <vector> @@ -29,7 +28,7 @@ struct cacheItem { guint32 offset; gchar *data; - //write code here to make it inline + // write code here to make it inline cacheItem() { data = nullptr; } ~cacheItem() { g_free(data); } }; @@ -67,7 +66,7 @@ gint cache_cur = 0; }; -//this structure contain all information about dictionary +// this structure contain all information about dictionary struct DictInfo { std::string ifo_file_name; guint32 wordcount; @@ -96,17 +95,27 @@ virtual const gchar *get_key(glong idx) = 0; virtual void 
get_data(glong idx) = 0; virtual const gchar *get_key_and_data(glong idx) = 0; - virtual bool lookup(const char *str, glong &idx) = 0; + virtual bool lookup(const char *str, std::set<glong> &idxs, glong &next_idx) = 0; + virtual bool lookup(const char *str, std::set<glong> &idxs) + { + glong unused_next_idx; + return lookup(str, idxs, unused_next_idx); + }; }; class SynFile { public: + SynFile() {} + ~SynFile() {} bool load(const std::string &url, gulong wc); - bool lookup(const char *str, glong &idx); + bool lookup(const char *str, std::set<glong> &idxs, glong &next_idx); + bool lookup(const char *str, std::set<glong> &idxs); + const gchar *get_key(glong idx) { return synlist[idx]; } private: - std::map<std::string, gulong> synonyms; + MapFile synfile; + std::vector<gchar *> synlist; }; class Dict : public DictBase @@ -133,7 +142,12 @@ *offset = idx_file->wordentry_offset; *size = idx_file->wordentry_size; } - bool Lookup(const char *str, glong &idx); + bool Lookup(const char *str, std::set<glong> &idxs, glong &next_idx); + bool Lookup(const char *str, std::set<glong> &idxs) + { + glong unused_next_idx; + return Lookup(str, idxs, unused_next_idx); + } bool LookupWithRule(GPatternSpec *pspec, glong *aIndex, int iBuffLen); @@ -155,7 +169,7 @@ Libs(std::function<void(void)> f = std::function<void(void)>()) { progress_func = f; - iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; //need to read from cfg. + iMaxFuzzyDistance = MAX_FUZZY_DISTANCE; // need to read from cfg. 
} void setVerbose(bool verbose) { verbose_ = verbose; } void setFuzzy(bool fuzzy) { fuzzy_ = fuzzy; } @@ -181,15 +195,12 @@ return nullptr; return oLib[iLib]->get_data(iIndex); } - const gchar *poGetCurrentWord(glong *iCurrent); - const gchar *poGetNextWord(const gchar *word, glong *iCurrent); - const gchar *poGetPreWord(glong *iCurrent); - bool LookupWord(const gchar *sWord, glong &iWordIndex, int iLib) + bool LookupWord(const gchar *sWord, std::set<glong> &iWordIndices, int iLib) { - return oLib[iLib]->Lookup(sWord, iWordIndex); + return oLib[iLib]->Lookup(sWord, iWordIndices); } - bool LookupSimilarWord(const gchar *sWord, glong &iWordIndex, int iLib); - bool SimpleLookupWord(const gchar *sWord, glong &iWordIndex, int iLib); + bool LookupSimilarWord(const gchar *sWord, std::set<glong> &iWordIndices, int iLib); + bool SimpleLookupWord(const gchar *sWord, std::set<glong> &iWordIndices, int iLib); bool LookupWithFuzzy(const gchar *sWord, gchar *reslist[], gint reslist_size); gint LookupWithRule(const gchar *sWord, gchar *reslist[]); diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/tests/not-unix-newlines-ifo/russian/russian.ifo new/sdcv-0.5.4/tests/not-unix-newlines-ifo/russian/russian.ifo --- old/sdcv-0.5.3/tests/not-unix-newlines-ifo/russian/russian.ifo 1970-01-01 01:00:00.000000000 +0100 +++ new/sdcv-0.5.4/tests/not-unix-newlines-ifo/russian/russian.ifo 2022-06-24 20:33:33.000000000 +0200 @@ -0,0 +1,9 @@ +StarDict's dict ifo file +version=3.0.0 +bookname=Russian-English Dictionary (ru-en) +wordcount=415144 +idxfilesize=12344255 +sametypesequence=h +synwordcount=1277580 +author=Vuizur +description= Binary files old/sdcv-0.5.3/tests/stardict-test_multiple_results-2.4.2/test.dict and new/sdcv-0.5.4/tests/stardict-test_multiple_results-2.4.2/test.dict differ Binary files old/sdcv-0.5.3/tests/stardict-test_multiple_results-2.4.2/test.idx and new/sdcv-0.5.4/tests/stardict-test_multiple_results-2.4.2/test.idx 
differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/tests/stardict-test_multiple_results-2.4.2/test.ifo new/sdcv-0.5.4/tests/stardict-test_multiple_results-2.4.2/test.ifo --- old/sdcv-0.5.3/tests/stardict-test_multiple_results-2.4.2/test.ifo 1970-01-01 01:00:00.000000000 +0100 +++ new/sdcv-0.5.4/tests/stardict-test_multiple_results-2.4.2/test.ifo 2022-06-24 20:33:33.000000000 +0200 @@ -0,0 +1,7 @@ +StarDict's dict ifo file +version=3.0.0 +bookname=Test multiple results +wordcount=246 +idxfilesize=5977 +synwordcount=124 +description= Binary files old/sdcv-0.5.3/tests/stardict-test_multiple_results-2.4.2/test.syn and new/sdcv-0.5.4/tests/stardict-test_multiple_results-2.4.2/test.syn differ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/tests/t_json new/sdcv-0.5.4/tests/t_json --- old/sdcv-0.5.3/tests/t_json 2020-08-14 12:06:51.000000000 +0200 +++ new/sdcv-0.5.4/tests/t_json 2022-06-24 20:33:33.000000000 +0200 @@ -18,8 +18,15 @@ fi } -test_json '[{"name": "Test synonyms", "wordcount": "2"},{"name": "Sample 1 test dictionary", "wordcount": "1"},{"name": "test_dict", "wordcount": "1"}]' -x -j -l -n --data-dir "$TEST_DIR" +test_json '[{"name": "Russian-English Dictionary (ru-en)", "wordcount": "415144"}, + {"name": "Test synonyms", "wordcount": "2"}, + {"name": "Test multiple results", "wordcount": "246"}, + {"name": "Sample 1 test dictionary", "wordcount": "1"}, + {"name": "test_dict", "wordcount": "1"}]' -x -j -l -n --data-dir "$TEST_DIR" test_json '[{"dict": "Test synonyms","word":"test","definition":"\u000aresult of test"}]' -x -j -n --data-dir "$TEST_DIR" foo test_json '[]' -x -j -n --data-dir "$TEST_DIR" foobarbaaz +# Test multiple searches, with the first failing. 
+test_json '[][{"dict": "Test synonyms","word":"test","definition":"\u000aresult of test"}]' -x -j -n --data-dir "$TEST_DIR" foobarbaaz foo + exit 0 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/tests/t_multiple_results new/sdcv-0.5.4/tests/t_multiple_results --- old/sdcv-0.5.3/tests/t_multiple_results 1970-01-01 01:00:00.000000000 +0100 +++ new/sdcv-0.5.4/tests/t_multiple_results 2022-06-24 20:33:33.000000000 +0200 @@ -0,0 +1,67 @@ +#!/bin/sh + +set -e + +SDCV="$1" +TEST_DIR="$2" + +unset SDCV_PAGER +unset STARDICT_DATA_DIR + +test_json() { + word="$1" + jq_cmp="$2" + result="$("$SDCV" --data-dir "$TEST_DIR" -exjn "$word" | sed 's|\\n|\\u000a|g')" + cmp_result="$(echo "$result" | jq "$jq_cmp")" + if [ "$cmp_result" != "true" ]; then + echo "expected '$jq_cmp' to return true, but $result didn't" + exit 1 + fi +} + +# Basic two-result search for the same headword. +test_json bark \ + '. == [ + {"dict":"Test multiple results","word":"bark","definition":"\u000aThe harsh sound made by a dog."}, + {"dict":"Test multiple results","word":"bark","definition":"\u000aThe tough outer covering of trees and other woody plants."} + ]' + +# Multi-result search where one word exists as both a synyonym and a separate +# headword. This ensures that if there is a matching synyonym we don't skip the +# regular search. +test_json cat \ + '. == [ + {"dict":"Test multiple results","word":"cat","definition":"\u000aA cute animal which (rarely) barks."}, + {"dict":"Test multiple results","word":"lion","definition":"\u000aA larger cat which might bite your head off."}, + {"dict":"Test multiple results","word":"panther","definition":"\u000aI know very little about panthers, sorry."} + ]' + +# Many-result search for a word that matches 120 distinct headwords. 
+test_json many_headwords 'length == 120' +test_json many_headwords 'all(.word == "many_headwords")' +test_json many_headwords \ + 'to_entries | map(.value.definition == "\u000aDefinition for [many_headwords] entry #\(.key+1) (same headword).") | all' + +# Many-result search for 120 words that have the same synonym. +test_json many_synonyms 'length == 120' +test_json many_synonyms \ + 'to_entries | map(.value.word == "many_synonyms-\(.key+101)") | all' +test_json many_synonyms \ + 'to_entries | map(.value.definition == "\u000aDefinition for [many_synonyms-\(.key+101)] (same synonym).") | all' + +# Ensure that we don't return more than one result even if a word can be +# resolved in more than one way. +# +# Most well-formed dictionaries don't have entries like this (it basically +# requires you to have a dictionary where there is a synonym that is identical +# to a word's headword or multiple identical synyonym entries). +# +# This entry was created by creating extra synonyms with different names then +# modifying the .syn file manually. +test_json many_resolution_paths \ + '. == [ + {"dict":"Test multiple results","word":"many_resolution_paths", + "definition":"\u000aDefinition for [many_resolution_paths] headword (same word, multiple synonym entries)."} + ]' + +exit 0 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/sdcv-0.5.3/tests/t_newlines_in_ifo new/sdcv-0.5.4/tests/t_newlines_in_ifo --- old/sdcv-0.5.3/tests/t_newlines_in_ifo 1970-01-01 01:00:00.000000000 +0100 +++ new/sdcv-0.5.4/tests/t_newlines_in_ifo 2022-06-24 20:33:33.000000000 +0200 @@ -0,0 +1,18 @@ +#!/bin/sh + +set -e + +PATH_TO_SDCV="$1" +TEST_DIR="$2" + +unset SDCV_PAGER +unset STARDICT_DATA_DIR + +RES=$("$PATH_TO_SDCV" -n -x --data-dir="$TEST_DIR/not-unix-newlines-ifo" -l | tail -n 1) + +if [ "$RES" = "Russian-English Dictionary (ru-en) 415144" ]; then + exit 0 +else + echo "test failed, unexpected result: $RES" >&2 + exit 1 +fi
