Hello community, here is the log from the commit of package libpinyin for openSUSE:Factory checked in at 2015-02-16 22:12:59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Comparing /work/SRC/openSUSE:Factory/libpinyin (Old) and /work/SRC/openSUSE:Factory/.libpinyin.new (New) ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "libpinyin" Changes: -------- --- /work/SRC/openSUSE:Factory/libpinyin/libpinyin.changes 2014-07-27 18:47:47.000000000 +0200 +++ /work/SRC/openSUSE:Factory/.libpinyin.new/libpinyin.changes 2015-02-16 22:13:00.000000000 +0100 @@ -1,0 +2,10 @@ +Sun Feb 15 04:13:47 UTC 2015 - [email protected] + +- Update to 1.0.99.20150212 + * fixes pinyin_get_pinyin_key_rest_offset method + * fixes pinyin_iterator_get_next_phrase method +- Update patch file: + remove old: libpinyin-1.0.0-avoid_download_at_buildtime.patch + add new: libpinyin-1.0.99.20150212-avoid_download_at_buildtime.patch + +------------------------------------------------------------------- Old: ---- libpinyin-1.0.0-avoid_download_at_buildtime.patch libpinyin-1.0.0.tar.gz model7.text.tar.gz New: ---- libpinyin-1.0.99.20150212-avoid_download_at_buildtime.patch libpinyin-1.0.99.20150212.tar.gz model8.text.tar.gz ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Other differences: ------------------ ++++++ libpinyin.spec ++++++ --- /var/tmp/diff_new_pack.oaWZji/_old 2015-02-16 22:13:02.000000000 +0100 +++ /var/tmp/diff_new_pack.oaWZji/_new 2015-02-16 22:13:02.000000000 +0100 @@ -1,7 +1,7 @@ # # spec file for package libpinyin # -# Copyright (c) 2014 SUSE LINUX Products GmbH, Nuernberg, Germany. +# Copyright (c) 2015 SUSE LINUX Products GmbH, Nuernberg, Germany. # # All modifications and additions to the file contributed by third parties # remain the property of their copyright owners, unless otherwise agreed @@ -16,20 +16,20 @@ # -%define lname libpinyin4 +%define lname libpinyin5 Name: libpinyin -Version: 1.0.0 +Version: 1.0.99.20150212 Release: 0 Summary: Intelligent Pinyin IME License: GPL-2.0+ Group: System/I18n/Chinese Url: https://github.com/libpinyin/libpinyin Source: %{name}-%{version}.tar.gz -Source1: model7.text.tar.gz +Source1: model8.text.tar.gz Source99: baselibs.conf # PATCH-FIX-OPENSUSE [email protected] - avoid download at buildtime -Patch: libpinyin-1.0.0-avoid_download_at_buildtime.patch +Patch: libpinyin-1.0.99.20150212-avoid_download_at_buildtime.patch BuildRequires: gcc-c++ BuildRequires: glib2-devel BuildRequires: gnome-common @@ -93,8 +93,6 @@ %package tools Summary: Tools for libpinyin Group: System/I18n/Chinese -Requires(post): update-alternatives -Requires(postun): update-alternatives %description tools Libpinyin is a staging joint effort of many Chinese Pinyin IME development teams. @@ -103,7 +101,6 @@ This package provides the tools used to make data files. - %prep %setup -q cp -r %{SOURCE1} data/ @@ -118,17 +115,6 @@ %makeinstall find %{buildroot}%{_libdir} -name "*.la" -type f -delete -print -# update-alternatives -mkdir -p %{buildroot}%{_sysconfdir}/alternatives -for alt in gen_binary_files gen_unigram import_interpolation ; do - mv %{buildroot}%{_bindir}/${alt} %{buildroot}%{_bindir}/${alt}-pinyin - mv %{buildroot}%{_mandir}/man1/${alt}.1 %{buildroot}%{_mandir}/man1/${alt}-pinyin.1 - touch %{buildroot}%{_sysconfdir}/alternatives/${alt} - touch %{buildroot}%{_sysconfdir}/alternatives/${alt}.1.gz - ln -sf %{_sysconfdir}/alternatives/${alt} %{buildroot}%{_bindir}/${alt} - ln -sf %{_sysconfdir}/alternatives/${alt}.1.gz %{buildroot}%{_mandir}/man1/ -done - %if 0%{?suse_version} %fdupes %{buildroot} %else @@ -139,54 +125,25 @@ %postun -n %{lname} -p /sbin/ldconfig -%post tools -update-alternatives \ - --install %{_bindir}/gen_binary_files gen_binary_files %{_bindir}/gen_binary_files-pinyin 20 \ - --slave %{_bindir}/gen_unigram gen_unigram %{_bindir}/gen_unigram-pinyin \ - --slave %{_bindir}/import_interpolation import_interpolation %{_bindir}/import_interpolation-pinyin \ - --slave %{_mandir}/man1/gen_binary_files.1.gz gen_binary_files.1.gz %{_mandir}/man1/gen_binary_files-pinyin.1.gz \ - --slave %{_mandir}/man1/gen_unigram.1.gz gen_unigram.1.gz %{_mandir}/man1/gen_unigram-pinyin.1.gz \ - --slave %{_mandir}/man1/import_interpolation.1.gz import_interpolation.1.gz %{_mandir}/man1/import_interpolation-pinyin.1.gz - -%postun tools -if [ $1 -eq 0 ] ; then - update-alternatives --remove gen_binary_files %{_bindir}/gen_binary_files-pinyin -fi - %files -n %{lname} %defattr(-,root,root) %doc ChangeLog AUTHORS COPYING README %{_libdir}/%{name}.so.* %{_mandir}/man1/libpinyin.1.gz +%files data +%defattr(-,root,root) +%{_libdir}/%{name} + %files tools %defattr(-,root,root) %{_bindir}/gen_binary_files %{_bindir}/gen_unigram %{_bindir}/import_interpolation -%{_bindir}/gen_binary_files-pinyin -%{_bindir}/gen_unigram-pinyin -%{_bindir}/import_interpolation-pinyin -%{_mandir}/man1/gen_binary_files.1.gz -%{_mandir}/man1/gen_unigram.1.gz -%{_mandir}/man1/import_interpolation.1.gz -%{_mandir}/man1/gen_binary_files-pinyin.1.gz -%{_mandir}/man1/gen_unigram-pinyin.1.gz -%{_mandir}/man1/import_interpolation-pinyin.1.gz -%ghost %{_sysconfdir}/alternatives/gen_binary_files -%ghost %{_sysconfdir}/alternatives/gen_unigram -%ghost %{_sysconfdir}/alternatives/import_interpolation -%ghost %{_sysconfdir}/alternatives/gen_binary_files.1.gz -%ghost %{_sysconfdir}/alternatives/gen_unigram.1.gz -%ghost %{_sysconfdir}/alternatives/import_interpolation.1.gz - -%files data -%defattr(-,root,root) -%{_libdir}/%{name} %files devel %defattr(-,root,root) -%{_includedir}/%{name}-%{version}/ +%{_includedir}/%{name}-*/ %{_libdir}/%{name}.so %{_libdir}/pkgconfig/%{name}.pc ++++++ libpinyin-1.0.0-avoid_download_at_buildtime.patch -> libpinyin-1.0.99.20150212-avoid_download_at_buildtime.patch ++++++ --- /work/SRC/openSUSE:Factory/libpinyin/libpinyin-1.0.0-avoid_download_at_buildtime.patch 2014-02-28 16:25:02.000000000 +0100 +++ /work/SRC/openSUSE:Factory/.libpinyin.new/libpinyin-1.0.99.20150212-avoid_download_at_buildtime.patch 2015-02-16 22:13:00.000000000 +0100 @@ -8,9 +8,9 @@ - COMMENT - "Downloading textual model data..." - COMMAND -- wget http://downloads.sourceforge.net/libpinyin/models/model7.text.tar.gz +- wget http://downloads.sourceforge.net/libpinyin/models/model8.text.tar.gz COMMAND - tar xvf model7.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data + tar xvf model8.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data ) diff -Nur libpinyin-1.0.0/data/Makefile.am libpinyin-1.0.0-new/data/Makefile.am --- libpinyin-1.0.0/data/Makefile.am 2013-11-25 13:44:41.000000000 +0800 @@ -19,7 +19,7 @@ CLEANFILES = $(binary_model_data) interpolation2.text: -- wget http://downloads.sourceforge.net/libpinyin/models/model7.text.tar.gz - tar xvf model7.text.tar.gz -C $(top_srcdir)/data +- wget http://downloads.sourceforge.net/libpinyin/models/model8.text.tar.gz + tar xvf model8.text.tar.gz -C $(top_srcdir)/data ++++++ libpinyin-1.0.0.tar.gz -> libpinyin-1.0.99.20150212.tar.gz ++++++ diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/ChangeLog new/libpinyin-1.0.99.20150212/ChangeLog --- old/libpinyin-1.0.0/ChangeLog 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/ChangeLog 2015-02-09 08:15:37.000000000 +0100 @@ -1,17 +1,67 @@ +version 1.0.0 +* the first official release of 1.0.x. + +version 0.9.94 +* bug fixes. + +version 0.9.93 +* fixes libpinyin issues from coverity scan report. + +version 0.9.92 +* bug fixes. + +version 0.9.91 +* code re-factor. + +version 0.9.0 +* the first official release of 0.9.x. +* fixes import dictionary. + +version 0.8.93 +* add back pinyin_clear_constraint. + +version 0.8.92 +* fixes model data. + +version 0.8.91 +* multiple dictioniares and user dictionary support. + +version 0.8.1 +* bug fixes. + +version 0.8.0 +* the first official release of 0.8.x. + +version 0.7.92 +* re-factor PhraseLookup class. +* all tests passed simple valgrind memory check. + +version 0.7.91 +* simplify PinyinLookup class. + +version 0.7.1 +* add API to lookup pinyin for characters. + +version 0.7.0 +* the first official release of 0.7.x. + +version 0.6.92 +* draft support for multiple professional phrase libraries. + version 0.6.91 -* support ucs4 characters; -* support guess sentence with prefix; +* support ucs4 characters. +* support guess sentence with prefix. * initially support fuzzy pinyin segment. version 0.6.0 * the first official release of 0.6.x. version 0.5.92 -* fixes new parsers and chewing large table; +* fixes new parsers and chewing large table. * improves pinyin_save. version 0.5.91 -* some code re-factor and simplify; +* some code re-factor and simplify. * fixes the self-learning work around. version 0.5.0 @@ -25,8 +75,8 @@ version 0.4.91 * New parsers for full pinyin/double pinyin/chewing. - * libpinyin now fully supports all pinyin auto corrections in -ibus-pinyin; + * libpinyin now fully supports all pinyin auto corrections of +ibus-pinyin. * libpinyin now better supports an/ang, en/eng, in/ing fuzzy pinyin match. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/configure.ac new/libpinyin-1.0.99.20150212/configure.ac --- old/libpinyin-1.0.0/configure.ac 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/configure.ac 2015-02-09 08:15:37.000000000 +0100 @@ -2,14 +2,20 @@ # Process this file with autoconf to produce a configure script. +# if not 1, append datestamp to the version number. +m4_define([libpinyin_released], [0]) m4_define([libpinyin_major_version], [1]) m4_define([libpinyin_minor_version], [0]) -m4_define([libpinyin_micro_version], [0]) -m4_define([libpinyin_abi_current], [4]) +m4_define([libpinyin_micro_version], [99]) +m4_define(libpinyin_maybe_datestamp, + m4_esyscmd([if test x]libpinyin_released[ != x1; then date +.%Y%m%d | tr -d '\n\r'; fi])) + +m4_define([libpinyin_abi_current], [5]) m4_define([libpinyin_abi_revision], [0]) m4_define([libpinyin_version], - [libpinyin_major_version.libpinyin_minor_version.libpinyin_micro_version]) + libpinyin_major_version.libpinyin_minor_version.libpinyin_micro_version[]libpinyin_maybe_datestamp) + m4_define([libpinyin_binary_version], [libpinyin_abi_current.libpinyin_abi_revision]) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/data/CMakeLists.txt new/libpinyin-1.0.99.20150212/data/CMakeLists.txt --- old/libpinyin-1.0.0/data/CMakeLists.txt 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/data/CMakeLists.txt 2015-02-09 08:15:37.000000000 +0100 @@ -46,9 +46,9 @@ COMMENT "Downloading textual model data..." COMMAND - wget http://downloads.sourceforge.net/libpinyin/models/model7.text.tar.gz + wget http://downloads.sourceforge.net/libpinyin/models/model8.text.tar.gz COMMAND - tar xvf model7.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data + tar xvf model8.text.tar.gz -C ${CMAKE_SOURCE_DIR}/data ) add_custom_command( diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/data/Makefile.am new/libpinyin-1.0.99.20150212/data/Makefile.am --- old/libpinyin-1.0.0/data/Makefile.am 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/data/Makefile.am 2015-02-09 08:15:37.000000000 +0100 @@ -48,11 +48,11 @@ CLEANFILES = $(binary_model_data) interpolation2.text: - wget http://downloads.sourceforge.net/libpinyin/models/model7.text.tar.gz - tar xvf model7.text.tar.gz -C $(top_srcdir)/data + wget http://downloads.sourceforge.net/libpinyin/models/model8.text.tar.gz + tar xvf model8.text.tar.gz -C $(top_srcdir)/data -$(tablefiles): interpolation2.text +$(tablefiles) table.conf: interpolation2.text bigram.db: $(textual_model_data) $(RM) $(binary_model_data) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/data/table.conf new/libpinyin-1.0.99.20150212/data/table.conf --- old/libpinyin-1.0.0/data/table.conf 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/data/table.conf 1970-01-01 01:00:00.000000000 +0100 @@ -1,17 +0,0 @@ -binary format version:3 -model data version:7 -lambda parameter:0.276607 - -4 art.table art.bin art.dbin DICTIONARY -5 culture.table culture.bin culture.dbin DICTIONARY -6 economy.table economy.bin economy.dbin DICTIONARY -7 geology.table geology.bin geology.dbin DICTIONARY -8 history.table history.bin history.dbin DICTIONARY - -9 life.table life.bin life.dbin DICTIONARY -10 nature.table nature.bin nature.dbin DICTIONARY -11 scitech.table scitech.bin scitech.dbin DICTIONARY -12 society.table society.bin society.dbin DICTIONARY -13 sport.table sport.bin sport.dbin DICTIONARY - -14 NULL NULL network.bin USER_FILE \ No newline at end of file diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/doc/Makefile.am new/libpinyin-1.0.99.20150212/doc/Makefile.am --- old/libpinyin-1.0.0/doc/Makefile.am 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/doc/Makefile.am 2015-02-09 08:15:37.000000000 +0100 @@ -16,9 +16,6 @@ ## Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. -man_MANS = libpinyin.1 \ - gen_binary_files.1 \ - import_interpolation.1 \ - gen_unigram.1 +man_MANS = libpinyin.1 EXTRA_DIST = $(man_MANS) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/doc/gen_binary_files.1 new/libpinyin-1.0.99.20150212/doc/gen_binary_files.1 --- old/libpinyin-1.0.0/doc/gen_binary_files.1 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/doc/gen_binary_files.1 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -.so man1/libpinyin.1 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/doc/gen_unigram.1 new/libpinyin-1.0.99.20150212/doc/gen_unigram.1 --- old/libpinyin-1.0.0/doc/gen_unigram.1 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/doc/gen_unigram.1 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -.so man1/libpinyin.1 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/doc/import_interpolation.1 new/libpinyin-1.0.99.20150212/doc/import_interpolation.1 --- old/libpinyin-1.0.0/doc/import_interpolation.1 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/doc/import_interpolation.1 1970-01-01 01:00:00.000000000 +0100 @@ -1 +0,0 @@ -.so man1/libpinyin.1 diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/doc/libpinyin.1 new/libpinyin-1.0.99.20150212/doc/libpinyin.1 --- old/libpinyin-1.0.0/doc/libpinyin.1 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/doc/libpinyin.1 2015-02-09 08:15:37.000000000 +0100 @@ -25,11 +25,9 @@ gen_unigram .SH EXAMPLE -Download the model.text.tar.gz, and extracts all files into a folder, then run the commands below to generate the binary model data. +Download the model.text.tar.gz, and extracts all files into the data sub-directory, then run the commands below to generate the binary model data. .RS -rm gb_char.bin gbk_char.bin phrase_index.bin pinyin_index.bin bigram.db - gen_binary_files --table-dir ../data import_interpolation < ../data/interpolation.text diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/libpinyin.ver new/libpinyin-1.0.99.20150212/src/libpinyin.ver --- old/libpinyin-1.0.0/src/libpinyin.ver 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/libpinyin.ver 2015-02-09 08:15:37.000000000 +0100 @@ -9,13 +9,19 @@ pinyin_begin_add_phrases; pinyin_iterator_add_phrase; pinyin_end_add_phrases; + pinyin_begin_get_phrases; + pinyin_iterator_has_next_phrase; + pinyin_iterator_get_next_phrase; + pinyin_end_get_phrases; pinyin_fini; pinyin_mask_out; pinyin_set_options; pinyin_alloc_instance; pinyin_free_instance; + pinyin_get_context; pinyin_guess_sentence; pinyin_guess_sentence_with_prefix; + pinyin_guess_predicted_candidates; pinyin_phrase_segment; pinyin_get_sentence; pinyin_parse_full_pinyin; @@ -24,10 +30,12 @@ pinyin_parse_more_double_pinyins; pinyin_parse_chewing; pinyin_parse_more_chewings; + pinyin_get_parsed_input_length; pinyin_in_chewing_keyboard; pinyin_guess_candidates; pinyin_guess_full_pinyin_candidates; pinyin_choose_candidate; + pinyin_choose_predicted_candidate; pinyin_clear_constraint; pinyin_lookup_tokens; pinyin_train; @@ -35,6 +43,7 @@ pinyin_get_chewing_string; pinyin_get_pinyin_string; pinyin_get_pinyin_strings; + pinyin_get_pinyin_is_incomplete; pinyin_token_get_phrase; pinyin_token_get_n_pronunciation; pinyin_token_get_nth_pronunciation; @@ -49,6 +58,7 @@ pinyin_get_pinyin_key_rest; pinyin_get_pinyin_key_rest_positions; pinyin_get_pinyin_key_rest_length; + pinyin_get_pinyin_key_rest_offset; pinyin_get_raw_full_pinyin; pinyin_get_n_phrase; pinyin_get_phrase_token; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/pinyin.cpp new/libpinyin-1.0.99.20150212/src/pinyin.cpp --- old/libpinyin-1.0.0/src/pinyin.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/pinyin.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -62,6 +62,7 @@ TokenVector m_prefixes; ChewingKeyVector m_pinyin_keys; ChewingKeyRestVector m_pinyin_key_rests; + size_t m_parsed_len; CandidateConstraints m_constraints; MatchResults m_match_results; CandidateVector m_candidates; @@ -89,6 +90,13 @@ guint8 m_phrase_index; }; +struct _export_iterator_t{ + pinyin_context_t * m_context; + guint8 m_phrase_index; + /* null token means no next item. */ + phrase_token_t m_next_token; + guint8 m_next_pronunciation; +}; static bool check_format(pinyin_context_t * context){ const char * userdir = context->m_user_dir; @@ -483,6 +491,119 @@ delete iter; } +export_iterator_t * pinyin_begin_get_phrases(pinyin_context_t * context, + guint index){ + export_iterator_t * iter = new export_iterator_t; + iter->m_context = context; + iter->m_phrase_index = index; + iter->m_next_token = null_token; + iter->m_next_pronunciation = 0; + + /* probe next token. */ + PhraseIndexRange range; + int retval = iter->m_context->m_phrase_index->get_range + (iter->m_phrase_index, range); + if (retval != ERROR_OK) + return iter; + + PhraseItem item; + phrase_token_t token = range.m_range_begin; + for (; token < range.m_range_end; ++token) { + retval = iter->m_context->m_phrase_index->get_phrase_item + (token, item); + if (ERROR_OK == retval && item.get_n_pronunciation() >= 1) { + iter->m_next_token = token; + break; + } + } + return iter; +} + +bool pinyin_iterator_has_next_phrase(export_iterator_t * iter){ + /* no next token. */ + if (null_token == iter->m_next_token) + return false; + return true; +} + +/* phrase, pinyin should be freed by g_free(). */ +bool pinyin_iterator_get_next_phrase(export_iterator_t * iter, + gchar ** phrase, + gchar ** pinyin, + gint * count){ + /* count "-1" means default count. */ + *phrase = NULL; *pinyin = NULL; *count = -1; + + PhraseItem item; + int retval = iter->m_context->m_phrase_index->get_phrase_item + (iter->m_next_token, item); + /* assume valid next token from previous call. */ + assert(ERROR_OK == retval); + + /* fill phrase and pronunciation pair. */ + ucs4_t phrase_ucs4[MAX_PHRASE_LENGTH]; + guint8 len = item.get_phrase_length(); + assert(item.get_phrase_string(phrase_ucs4)); + gchar * phrase_utf8 = g_ucs4_to_utf8 + (phrase_ucs4, len, NULL, NULL, NULL); + + guint8 nth_pronun = iter->m_next_pronunciation; + guint8 n_pronuns = item.get_n_pronunciation(); + /* assume valid pronunciation from previous call. */ + assert(nth_pronun < n_pronuns); + ChewingKey keys[MAX_PHRASE_LENGTH]; + guint32 freq = 0; + assert(item.get_nth_pronunciation(nth_pronun, keys, freq)); + + GPtrArray * array = g_ptr_array_new(); + for(size_t i = 0; i < len; ++i) { + g_ptr_array_add(array, keys[i].get_pinyin_string()); + } + g_ptr_array_add(array, NULL); + + gchar ** strings = (gchar **)g_ptr_array_free(array, FALSE); + gchar * pinyins = g_strjoinv("'", strings); + g_strfreev(strings); + + /* use default value. */ + *phrase = phrase_utf8; *pinyin = pinyins; + if (freq > 0) + *count = freq; + + /* probe next pronunciation. */ + nth_pronun ++; + if (nth_pronun < n_pronuns) { + iter->m_next_pronunciation = nth_pronun; + return true; + } + + iter->m_next_pronunciation = 0; + /* probe next token. */ + PhraseIndexRange range; + retval = iter->m_context->m_phrase_index->get_range + (iter->m_phrase_index, range); + if (retval != ERROR_OK) { + iter->m_next_token = null_token; + return true; + } + + phrase_token_t token = iter->m_next_token + 1; + iter->m_next_token = null_token; + for (; token < range.m_range_end; ++token) { + retval = iter->m_context->m_phrase_index->get_phrase_item + (token, item); + if (ERROR_OK == retval && item.get_n_pronunciation() >= 1) { + iter->m_next_token = token; + break; + } + } + return true; +} + +void pinyin_end_get_phrases(export_iterator_t * iter){ + delete iter; +} + bool pinyin_save(pinyin_context_t * context){ if (!context->m_user_dir) return false; @@ -753,6 +874,9 @@ instance->m_pinyin_keys = g_array_new(FALSE, FALSE, sizeof(ChewingKey)); instance->m_pinyin_key_rests = g_array_new(FALSE, FALSE, sizeof(ChewingKeyRest)); + + instance->m_parsed_len = 0; + instance->m_constraints = g_array_new (TRUE, FALSE, sizeof(lookup_constraint_t)); instance->m_match_results = @@ -775,6 +899,9 @@ delete instance; } +pinyin_context_t * pinyin_get_context (pinyin_instance_t * instance){ + return instance->m_context; +} static bool pinyin_update_constraints(pinyin_instance_t * instance){ pinyin_context_t * & context = instance->m_context; @@ -812,15 +939,12 @@ return retval; } -bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, - const char * prefix){ +static void _compute_prefixes(pinyin_instance_t * instance, + const char * prefix){ pinyin_context_t * & context = instance->m_context; FacadePhraseIndex * & phrase_index = context->m_phrase_index; - g_array_set_size(instance->m_prefixes, 0); - g_array_append_val(instance->m_prefixes, sentence_start); - glong len_str = 0; ucs4_t * ucs4_str = g_utf8_to_ucs4(prefix, -1, NULL, &len_str, NULL); GArray * tokenarray = g_array_new(FALSE, FALSE, sizeof(phrase_token_t)); @@ -847,6 +971,16 @@ } g_array_free(tokenarray, TRUE); g_free(ucs4_str); +} + +bool pinyin_guess_sentence_with_prefix(pinyin_instance_t * instance, + const char * prefix){ + pinyin_context_t * & context = instance->m_context; + + g_array_set_size(instance->m_prefixes, 0); + g_array_append_val(instance->m_prefixes, sentence_start); + + _compute_prefixes(instance, prefix); pinyin_update_constraints(instance); bool retval = context->m_pinyin_lookup->get_best_match @@ -906,11 +1040,12 @@ instance->m_raw_full_pinyin = g_strdup(pinyins); int pinyin_len = strlen(pinyins); - int parse_len = context->m_full_pinyin_parser->parse + int parsed_len = context->m_full_pinyin_parser->parse ( context->m_options, instance->m_pinyin_keys, instance->m_pinyin_key_rests, pinyins, pinyin_len); - return parse_len; + instance->m_parsed_len = parsed_len; + return parsed_len; } bool pinyin_parse_double_pinyin(pinyin_instance_t * instance, @@ -929,11 +1064,12 @@ pinyin_context_t * & context = instance->m_context; int pinyin_len = strlen(pinyins); - int parse_len = context->m_double_pinyin_parser->parse + int parsed_len = context->m_double_pinyin_parser->parse ( context->m_options, instance->m_pinyin_keys, instance->m_pinyin_key_rests, pinyins, pinyin_len); - return parse_len; + instance->m_parsed_len = parsed_len; + return parsed_len; } bool pinyin_parse_chewing(pinyin_instance_t * instance, @@ -952,11 +1088,16 @@ pinyin_context_t * & context = instance->m_context; int chewing_len = strlen(chewings); - int parse_len = context->m_chewing_parser->parse + int parsed_len = context->m_chewing_parser->parse ( context->m_options, instance->m_pinyin_keys, instance->m_pinyin_key_rests, chewings, chewing_len); - return parse_len; + instance->m_parsed_len = parsed_len; + return parsed_len; +} + +size_t pinyin_get_parsed_input_length(pinyin_instance_t * instance) { + return instance->m_parsed_len; } bool pinyin_in_chewing_keyboard(pinyin_instance_t * instance, @@ -1164,6 +1305,7 @@ case NORMAL_CANDIDATE: case DIVIDED_CANDIDATE: case RESPLIT_CANDIDATE: + case PREDICTED_CANDIDATE: pinyin_token_get_phrase (instance, candidate->m_token, NULL, &(candidate->m_phrase_string)); @@ -1751,12 +1893,98 @@ return true; } +bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, + const char * prefix) { + const guint32 filter = 256; + + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + GArray * & prefixes = instance->m_prefixes; + + _free_candidates(instance->m_candidates); + + _compute_prefixes(instance, prefix); + + phrase_token_t prev_token = _get_previous_token(instance, 0); + if (null_token == prev_token) + return false; + + /* merge single gram. */ + SingleGram merged_gram; + SingleGram * system_gram = NULL, * user_gram = NULL; + context->m_system_bigram->load(prev_token, system_gram); + context->m_user_bigram->load(prev_token, user_gram); + merge_single_gram(&merged_gram, system_gram, user_gram); + + GArray * items = g_array_new(FALSE, FALSE, sizeof(lookup_candidate_t)); + + /* retrieve all items. */ + BigramPhraseWithCountArray tokens = g_array_new + (FALSE, FALSE, sizeof(BigramPhraseItemWithCount)); + merged_gram.retrieve_all(tokens); + + /* sort the longer word first. */ + PhraseItem cached_item; + for (size_t len = MAX_PHRASE_LENGTH; len > 0; --len) { + /* append items. */ + for (size_t k = 0; k < tokens->len; ++k){ + BigramPhraseItemWithCount * phrase_item = &g_array_index + (tokens, BigramPhraseItemWithCount, k); + + if (phrase_item->m_count < filter) + continue; + + int result = phrase_index->get_phrase_item + (phrase_item->m_token, cached_item); + if (ERROR_NO_SUB_PHRASE_INDEX == result) + continue; + + if (len != cached_item.get_phrase_length()) + continue; + + lookup_candidate_t item; + item.m_candidate_type = PREDICTED_CANDIDATE; + item.m_token = phrase_item->m_token; + g_array_append_val(items, item); + } + + _compute_frequency_of_items(context, prev_token, &merged_gram, items); + + /* sort the candidates of the same length by frequency. */ + g_array_sort(items, compare_item_with_frequency); + + /* transfer back items to tokens, and save it into candidates */ + for (size_t k = 0; k < items->len; ++k) { + lookup_candidate_t * item = &g_array_index + (items, lookup_candidate_t, k); + g_array_append_val(instance->m_candidates, *item); + } + } + + g_array_free(items, TRUE); + if (system_gram) + delete system_gram; + if (user_gram) + delete user_gram; + + /* post process to remove duplicated candidates */ + _compute_phrase_strings_of_items(instance, 0, instance->m_candidates); + + _remove_duplicated_items_by_phrase_string(instance, instance->m_candidates); + + return true; +} int pinyin_choose_candidate(pinyin_instance_t * instance, size_t offset, lookup_candidate_t * candidate){ + assert(PREDICTED_CANDIDATE != candidate->m_candidate_type); + pinyin_context_t * & context = instance->m_context; + if (BEST_MATCH_CANDIDATE == candidate->m_candidate_type) + return instance->m_pinyin_keys->len; + if (DIVIDED_CANDIDATE == candidate->m_candidate_type || RESPLIT_CANDIDATE == candidate->m_candidate_type) { /* update full pinyin. */ @@ -1795,6 +2023,49 @@ return offset + len; } +bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, + lookup_candidate_t * candidate){ + assert(PREDICTED_CANDIDATE == candidate->m_candidate_type); + + const guint32 initial_seed = 23 * 3; + const guint32 unigram_factor = 7; + + pinyin_context_t * & context = instance->m_context; + FacadePhraseIndex * & phrase_index = context->m_phrase_index; + GArray * & prefixes = instance->m_prefixes; + + /* train uni-gram */ + phrase_token_t token = candidate->m_token; + int error = phrase_index->add_unigram_frequency + (token, initial_seed * unigram_factor); + if (ERROR_INTEGER_OVERFLOW == error) + return false; + + phrase_token_t prev_token = _get_previous_token(instance, 0); + if (null_token == prev_token) + return false; + + SingleGram * user_gram = NULL; + context->m_user_bigram->load(prev_token, user_gram); + + if (NULL == user_gram) + user_gram = new SingleGram; + + /* train bi-gram */ + guint32 total_freq = 0; + assert(user_gram->get_total_freq(total_freq)); + guint32 freq = 0; + if (!user_gram->get_freq(token, freq)) { + assert(user_gram->insert_freq(token, initial_seed)); + } else { + assert(user_gram->set_freq(token, freq + initial_seed)); + } + assert(user_gram->set_total_freq(total_freq + initial_seed)); + context->m_user_bigram->store(prev_token, user_gram); + delete user_gram; + return true; +} + bool pinyin_clear_constraint(pinyin_instance_t * instance, size_t offset){ pinyin_context_t * & context = instance->m_context; @@ -1840,6 +2111,7 @@ bool pinyin_reset(pinyin_instance_t * instance){ g_free(instance->m_raw_full_pinyin); instance->m_raw_full_pinyin = NULL; + instance->m_parsed_len = 0; g_array_set_size(instance->m_prefixes, 0); g_array_set_size(instance->m_pinyin_keys, 0); @@ -1887,6 +2159,15 @@ return true; } +bool pinyin_get_pinyin_is_incomplete(pinyin_instance_t * instance, + ChewingKey * key) { + if (CHEWING_ZERO_MIDDLE == key->m_middle && + CHEWING_ZERO_FINAL == key->m_final) + return true; + + return false; +} + bool pinyin_token_get_phrase(pinyin_instance_t * instance, phrase_token_t token, guint * len, @@ -2063,6 +2344,37 @@ return true; } +bool pinyin_get_pinyin_key_rest_offset(pinyin_instance_t * instance, + guint16 cursor, + guint16 * offset) { + guint len = 0; + assert (instance->m_pinyin_keys->len == + instance->m_pinyin_key_rests->len); + len = instance->m_pinyin_key_rests->len; + + ChewingKeyRestVector & pinyin_key_rests = + instance->m_pinyin_key_rests; + + guint inner_cursor = len; + + guint16 prev_end = 0, cur_end; + for (size_t i = 0; i < len; ++i) { + ChewingKeyRest *pos = NULL; + pos = &g_array_index(pinyin_key_rests, ChewingKeyRest, i); + cur_end = pos->m_raw_end; + + if (prev_end <= cursor && cursor < cur_end) + inner_cursor = i; + + prev_end = cur_end; + } + + assert (inner_cursor >= 0); + *offset = inner_cursor; + + return true; +} + bool pinyin_get_raw_full_pinyin(pinyin_instance_t * instance, const gchar ** utf8_str) { *utf8_str = instance->m_raw_full_pinyin; diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/pinyin.h new/libpinyin-1.0.99.20150212/src/pinyin.h --- old/libpinyin-1.0.0/src/pinyin.h 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/pinyin.h 2015-02-09 08:15:37.000000000 +0100 @@ -38,13 +38,15 @@ typedef struct _lookup_candidate_t lookup_candidate_t; typedef struct _import_iterator_t import_iterator_t; +typedef struct _export_iterator_t export_iterator_t; typedef enum _lookup_candidate_type_t{ BEST_MATCH_CANDIDATE = 1, NORMAL_CANDIDATE, DIVIDED_CANDIDATE, RESPLIT_CANDIDATE, - ZOMBIE_CANDIDATE + ZOMBIE_CANDIDATE, + PREDICTED_CANDIDATE } lookup_candidate_type_t; /** @@ -120,6 +122,53 @@ void pinyin_end_add_phrases(import_iterator_t * iter); /** + * pinyin_begin_get_phrases: + * @context: the pinyin context. + * @index: the phrase index to be exported. + * @returns: the export iterator. + * + * Begin to get phrases. + * + */ +export_iterator_t * pinyin_begin_get_phrases(pinyin_context_t * context, + guint index); + +/** + * pinyin_iterator_has_next_phrase: + * @iter: the export iterator. + * @returns: whether the iterator has the next phrase. + * + * Check whether the iterator has the next phrase. + * + */ +bool pinyin_iterator_has_next_phrase(export_iterator_t * iter); + +/** + * pinyin_iterator_get_next_phrase: + * @iter: the export iterator. + * @phrase: the phrase string. + * @pinyin: the pinyin string. + * @count: the count of the phrase/pinyin pair, -1 means the default value. + * @returns: whether the get next phrase operation succeeded. + * + * Get a pair of phrase and pinyin with count. + * + */ +bool pinyin_iterator_get_next_phrase(export_iterator_t * iter, + gchar ** phrase, + gchar ** pinyin, + gint * count); + +/** + * pinyin_end_get_phrases: + * @iter: the export iterator. + * + * End getting phrases. + * + */ +void pinyin_end_get_phrases(export_iterator_t * iter); + +/** * pinyin_save: * @context: the pinyin context to be saved into user directory. * @returns: whether the save succeeded. @@ -209,6 +258,15 @@ */ void pinyin_free_instance(pinyin_instance_t * instance); +/** + * pinyin_get_context: + * @instance: the pinyin instance. + * @returns: the pinyin context. + * + * Get the pinyin context from the pinyin instance. + * + */ +pinyin_context_t * pinyin_get_context (pinyin_instance_t * instance); /** * pinyin_guess_sentence: @@ -233,6 +291,18 @@ const char * prefix); /** + * pinyin_guess_predicted_candidates: + * @instance: the pinyin instance. + * @prefix: the prefix before the predicted candidates. + * @returns: whether the predicted candidates are guessed successfully. + * + * Guess the predicted candidates after the prefix word. + * + */ +bool pinyin_guess_predicted_candidates(pinyin_instance_t * instance, + const char * prefix); + +/** * pinyin_phrase_segment: * @instance: the pinyin instance. * @sentence: the utf-8 sentence to be segmented. @@ -337,6 +407,17 @@ const char * chewings); /** + * pinyin_get_parsed_input_length: + * @instance: the pinyin instance. + * @returns: the parsed_length of the input. + * + * Get the parsed length of the input. + * + */ +size_t pinyin_get_parsed_input_length(pinyin_instance_t * instance); + + +/** * pinyin_in_chewing_keyboard: * @instance: the pinyin instance. * @key: the input key. @@ -387,6 +468,18 @@ lookup_candidate_t * candidate); /** + * pinyin_choose_predicted_candidate: + * @instance: the pinyin instance. + * @candidate: the selected candidate. + * @returns: whether the self-learning is successful. + * + * Choose a predicted candidate. + * + */ +bool pinyin_choose_predicted_candidate(pinyin_instance_t * instance, + lookup_candidate_t * candidate); + +/** * pinyin_clear_constraint: * @instance: the pinyin instance. * @offset: the offset in the pinyin keys. @@ -476,6 +569,18 @@ gchar ** yunmu); /** + * pinyin_get_pinyin_is_incomplete: + * @instance: the pinyin instance. + * @key: the pinyin key. + * @returns: whether the pinyin key is incomplete pinyin. + * + * Check whether the pinyin key is incomplete pinyin. + * + */ +bool pinyin_get_pinyin_is_incomplete(pinyin_instance_t * instance, + ChewingKey * key); + +/** * pinyin_token_get_phrase: * @instance: the pinyin instance. * @token: the phrase token. @@ -672,6 +777,20 @@ guint16 * length); /** + * pinyin_get_pinyin_key_rest_offset: + * @instance: the pinyin instance. + * @cursor: the cursor. + * @offset: the offset in the pinyin array. + * @returns: whether the get operation is successful. + * + * Get the offset in the pinyin key array. + * + */ +bool pinyin_get_pinyin_key_rest_offset(pinyin_instance_t * instance, + guint16 cursor, + guint16 * offset); + +/** * pinyin_get_raw_full_pinyin: * @instance: the pinyin instance. * @utf8_str: the modified raw full pinyin after choose candidate. diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/storage/chewing_large_table.cpp new/libpinyin-1.0.99.20150212/src/storage/chewing_large_table.cpp --- old/libpinyin-1.0.0/src/storage/chewing_large_table.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/storage/chewing_large_table.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -668,7 +668,7 @@ size_t freq; while (!feof(infile)) { - int num = fscanf(infile, "%s %s %u %ld", + int num = fscanf(infile, "%256s %256s %u %ld", pinyin, phrase, &token, &freq); if (4 != num) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/storage/phrase_index.cpp new/libpinyin-1.0.99.20150212/src/storage/phrase_index.cpp --- old/libpinyin-1.0.0/src/storage/phrase_index.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/storage/phrase_index.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -528,7 +528,7 @@ phrase_token_t cur_token = 0; while (!feof(infile)){ - int num = fscanf(infile, "%s %s %u %ld", + int num = fscanf(infile, "%256s %256s %u %ld", pinyin, phrase, &token, &freq); if (4 != num) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/storage/phrase_large_table2.cpp new/libpinyin-1.0.99.20150212/src/storage/phrase_large_table2.cpp --- old/libpinyin-1.0.0/src/storage/phrase_large_table2.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/storage/phrase_large_table2.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -473,7 +473,7 @@ size_t freq; while (!feof(infile)) { - int num = fscanf(infile, "%s %s %u %ld", + int num = fscanf(infile, "%256s %256s %u %ld", pinyin, phrase, &token, &freq); if (4 != num) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/src/storage/table_info.cpp new/libpinyin-1.0.99.20150212/src/storage/table_info.cpp --- old/libpinyin-1.0.0/src/storage/table_info.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/src/storage/table_info.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -162,7 +162,7 @@ int index = 0; char tablefile[256], sysfile[256], userfile[256], filetype[256]; while (!feof(input)) { - num = fscanf(input, "%d %s %s %s %s\n", + num = fscanf(input, "%d %256s %256s %256s %256s\n", &index, tablefile, sysfile, userfile, filetype); if (5 != num) diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/libpinyin-1.0.0/utils/storage/gen_pinyin_table.cpp new/libpinyin-1.0.99.20150212/utils/storage/gen_pinyin_table.cpp --- old/libpinyin-1.0.0/utils/storage/gen_pinyin_table.cpp 2013-11-25 06:44:41.000000000 +0100 +++ new/libpinyin-1.0.99.20150212/utils/storage/gen_pinyin_table.cpp 2015-02-09 08:15:37.000000000 +0100 @@ -143,7 +143,7 @@ } while ( !feof(infile)){ - int num = fscanf(infile, "%s %s %u", + int num = fscanf(infile, "%1024s %1024s %u", phrase, pinyin, &freq); if (3 != num) ++++++ model7.text.tar.gz -> model8.text.tar.gz ++++++ /work/SRC/openSUSE:Factory/libpinyin/model7.text.tar.gz /work/SRC/openSUSE:Factory/.libpinyin.new/model8.text.tar.gz differ: char 5, line 1 -- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
