From b05f11b01e910e8d4449227d43622fbe05d134bb Mon Sep 17 00:00:00 2001 From: WhiredPlanck Date: Sat, 10 Jun 2023 09:58:15 +0800 Subject: [PATCH 1/5] feat,refactor(dict): tweak to reduce peak memory usage - Use smart pointer - Release memory in time This may save about 12% memory usage when compiling dict. --- src/rime/dict/dict_compiler.cc | 10 ++++++---- src/rime/dict/entry_collector.cc | 10 +++++----- src/rime/dict/entry_collector.h | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/rime/dict/dict_compiler.cc b/src/rime/dict/dict_compiler.cc index 22fe0cc691..c2c6ebf6c4 100644 --- a/src/rime/dict/dict_compiler.cc +++ b/src/rime/dict/dict_compiler.cc @@ -236,9 +236,9 @@ bool DictCompiler::BuildTable(int table_index, for (const auto& s : collector.syllabary) { syllable_to_id[s] = syllable_id++; } - for (RawDictEntry& r : collector.entries) { + for (auto r : collector.entries) { Code code; - for (const auto& s : r.raw_code) { + for (const auto& s : r->raw_code) { code.push_back(syllable_to_id[s]); } DictEntryList* ls = vocabulary.LocateEntries(code); @@ -248,10 +248,12 @@ bool DictCompiler::BuildTable(int table_index, } auto e = New<DictEntry>(); e->code.swap(code); - e->text.swap(r.text); - e->weight = log(r.weight > 0 ? r.weight : DBL_EPSILON); + e->text.swap(r->text); + e->weight = log(r->weight > 0 ? 
r->weight : DBL_EPSILON); ls->push_back(e); } + // release memory in time to reduce peak memory usage + vector<of<RawDictEntry>>().swap(collector.entries); if (settings->sort_order() != "original") { vocabulary.SortHomophones(); } diff --git a/src/rime/dict/entry_collector.cc b/src/rime/dict/entry_collector.cc index ad5857b454..f3232c470d 100644 --- a/src/rime/dict/entry_collector.cc +++ b/src/rime/dict/entry_collector.cc @@ -206,7 +206,7 @@ void EntryCollector::CreateEntry(const string &word, words[e.text][code_str] += e.weight; total_weight[e.text] += e.weight; } - entries.push_back(e); + entries.emplace_back(New<RawDictEntry>(e)); ++num_entries; } @@ -240,10 +240,10 @@ void EntryCollector::Dump(const string& file_name) const { out << "# - " << syllable << std::endl; } out << std::endl; - for (const RawDictEntry& e : entries) { - out << e.text << '\t' - << e.raw_code.ToString() << '\t' - << e.weight << std::endl; + for (const auto &e : entries) { + out << e->text << '\t' + << e->raw_code.ToString() << '\t' + << e->weight << std::endl; } out.close(); } diff --git a/src/rime/dict/entry_collector.h b/src/rime/dict/entry_collector.h index c0ce2c7ea8..77bc7ae2ee 100644 --- a/src/rime/dict/entry_collector.h +++ b/src/rime/dict/entry_collector.h @@ -35,7 +35,7 @@ class EntryCollector : public PhraseCollector { public: Syllabary syllabary; bool build_syllabary = true; - vector<RawDictEntry> entries; + vector<of<RawDictEntry>> entries; size_t num_entries = 0; ReverseLookupTable stems; From 06bcb8c2e09f21448db767180d58098e15574641 Mon Sep 17 00:00:00 2001 From: WhiredPlanck Date: Sat, 10 Jun 2023 11:01:39 +0800 Subject: [PATCH 2/5] refactor(vocabulary.h): align the size of DictEntry struct This may reduce about 1% peak memory usage when compiling dict. 
--- src/rime/dict/vocabulary.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/rime/dict/vocabulary.h b/src/rime/dict/vocabulary.h index c6a985c6dd..2b3d09af29 100644 --- a/src/rime/dict/vocabulary.h +++ b/src/rime/dict/vocabulary.h @@ -34,10 +34,10 @@ struct DictEntry { string text; string comment; string preedit; - double weight = 0.0; - int commit_count = 0; Code code; // multi-syllable code from prism string custom_code; // user defined code + double weight = 0.0; + int commit_count = 0; int remaining_code_length = 0; DictEntry() = default; From 2644f3c4115958c3b9ea4f6bfd9c910be1c22a25 Mon Sep 17 00:00:00 2001 From: WhiredPlanck Date: Sat, 10 Jun 2023 13:15:00 +0800 Subject: [PATCH 3/5] chore(.gitignore): ignore .cache .cache is usually used by clangd --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 651cad7394..7e4a4b1d5a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ env.bat node_modules/ *~ .*.swp +.cache/ From 210ab6c6a82ea1cd4bb5873672ce8492263d47a4 Mon Sep 17 00:00:00 2001 From: WhiredPlanck Date: Sat, 10 Jun 2023 23:15:50 +0800 Subject: [PATCH 4/5] feat(dict): shorten the data struct used by Vocabulary This may save about 18% peak memory consumption when compiling dict. 
--- src/rime/dict/dict_compiler.cc | 4 +-- src/rime/dict/table.cc | 6 ++--- src/rime/dict/table.h | 6 ++--- src/rime/dict/vocabulary.cc | 45 ++++++++++++++++++++++++++++------ src/rime/dict/vocabulary.h | 20 +++++++++++++-- test/table_test.cc | 18 +++++++------- 6 files changed, 73 insertions(+), 26 deletions(-) diff --git a/src/rime/dict/dict_compiler.cc b/src/rime/dict/dict_compiler.cc index c2c6ebf6c4..df50c1439b 100644 --- a/src/rime/dict/dict_compiler.cc +++ b/src/rime/dict/dict_compiler.cc @@ -241,12 +241,12 @@ bool DictCompiler::BuildTable(int table_index, for (const auto& s : r->raw_code) { code.push_back(syllable_to_id[s]); } - DictEntryList* ls = vocabulary.LocateEntries(code); + auto ls = vocabulary.LocateEntries(code); if (!ls) { LOG(ERROR) << "Error locating entries in vocabulary."; continue; } - auto e = New<DictEntry>(); + auto e = New<ShortDictEntry>(); e->code.swap(code); e->text.swap(r->text); e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON); diff --git a/src/rime/dict/table.cc b/src/rime/dict/table.cc index 5d19539217..f0059c2405 100644 --- a/src/rime/dict/table.cc +++ b/src/rime/dict/table.cc @@ -518,7 +518,7 @@ table::TailIndex* Table::BuildTailIndex(const Code& prefix, return index; } -Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) { +Array<table::Entry>* Table::BuildEntryArray(const ShortDictEntryList& entries) { auto array = CreateArray<table::Entry>(entries.size()); if (!array) { return NULL; @@ -531,7 +531,7 @@ Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) { return array; } -bool Table::BuildEntryList(const DictEntryList& src, +bool Table::BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest) { if (!dest) return false; @@ -549,7 +549,7 @@ bool Table::BuildEntryList(const DictEntryList& src, return true; } -bool Table::BuildEntry(const DictEntry& dict_entry, table::Entry* entry) { +bool Table::BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry) { if (!entry) return false; if (!AddString(dict_entry.text, &entry->text, dict_entry.weight)) { 
diff --git a/src/rime/dict/table.h b/src/rime/dict/table.h index 3c14bddd62..1e4c018c42 100644 --- a/src/rime/dict/table.h +++ b/src/rime/dict/table.h @@ -166,9 +166,9 @@ class Table : public MappedFile { const Vocabulary& vocabulary); bool BuildPhraseIndex(Code code, const Vocabulary& vocabulary, map<string, int>* index_data); - Array<table::Entry>* BuildEntryArray(const DictEntryList& entries); - bool BuildEntryList(const DictEntryList& src, List<table::Entry>* dest); - bool BuildEntry(const DictEntry& dict_entry, table::Entry* entry); + Array<table::Entry>* BuildEntryArray(const ShortDictEntryList& entries); + bool BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest); + bool BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry); string GetString(const table::StringType& x); bool AddString(const string& src, table::StringType* dest, diff --git a/src/rime/dict/vocabulary.cc b/src/rime/dict/vocabulary.cc index b4786d0c36..18a8577e5d 100644 --- a/src/rime/dict/vocabulary.cc +++ b/src/rime/dict/vocabulary.cc @@ -5,6 +5,7 @@ // 2011-07-24 GONG Chen // #include <algorithm> +#include <iterator> #include <sstream> #include <utility> #include <rime/dict/vocabulary.h> @@ -59,6 +60,18 @@ string Code::ToString() const { return stream.str(); } +inline ShortDictEntry DictEntry::ToShort() const { + return {text, code, weight}; +} + +bool ShortDictEntry::operator< (const ShortDictEntry& other) const { + // Sort different entries sharing the same code by weight desc. + if (weight != other.weight) + return weight > other.weight; + // reduce carbon emission + return 0; //text < other.text; +} + bool DictEntry::operator< (const DictEntry& other) const { // Sort different entries sharing the same code by weight desc. 
if (weight != other.weight) @@ -72,16 +85,34 @@ inline bool dereference_less(const T& a, const T& b) { return *a < *b; } +template <typename C> +inline void sort(C &container) { + std::sort(std::begin(container), std::end(container), dereference_less<typename C::value_type>); +} + +template <typename C> +inline void sort_range(C &container, size_t start, size_t count) { + if (start >= container.size()) + return; + auto i(std::begin(container) + start); + auto j(start + count >= container.size() ? std::end(container) : i + count); + std::sort(i, j, dereference_less<typename C::value_type>); +} + +void ShortDictEntryList::Sort() { + sort(*this); +} + +void ShortDictEntryList::SortRange(size_t start, size_t count) { + sort_range(*this, start, count); +} + void DictEntryList::Sort() { - std::sort(begin(), end(), dereference_less<DictEntryList::value_type>); + sort(*this); } void DictEntryList::SortRange(size_t start, size_t count) { - if (start >= size()) - return; - iterator i(begin() + start); - iterator j(start + count >= size() ? end() : i + count); - std::sort(i, j, dereference_less<DictEntryList::value_type>); + sort_range(*this, start, count); } void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) { @@ -96,7 +127,7 @@ void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) { } } -DictEntryList* Vocabulary::LocateEntries(const Code& code) { +ShortDictEntryList* Vocabulary::LocateEntries(const Code& code) { Vocabulary* v = this; size_t n = code.size(); for (size_t i = 0; i < n; ++i) { diff --git a/src/rime/dict/vocabulary.h b/src/rime/dict/vocabulary.h index 2b3d09af29..563622f09d 100644 --- a/src/rime/dict/vocabulary.h +++ b/src/rime/dict/vocabulary.h @@ -30,6 +30,15 @@ class Code : public vector<SyllableId> { string ToString() const; }; +struct ShortDictEntry { + string text; + Code code; // multi-syllable code from prism + double weight = 0.0; + + ShortDictEntry() = default; + bool operator< (const ShortDictEntry& other) const; +}; + struct DictEntry { string text; string comment; @@ -41,9 +50,16 @@ struct DictEntry { int remaining_code_length = 0; DictEntry() = default; + 
ShortDictEntry ToShort() const; bool operator< (const DictEntry& other) const; }; +class ShortDictEntryList : public vector<of<ShortDictEntry>> { + public: + void Sort(); + void SortRange(size_t start, size_t count); +}; + class DictEntryList : public vector<of<DictEntry>> { public: void Sort(); @@ -64,13 +80,13 @@ class DictEntryFilterBinder { class Vocabulary; struct VocabularyPage { - DictEntryList entries; + ShortDictEntryList entries; an<Vocabulary> next_level; }; class Vocabulary : public map<int, VocabularyPage> { public: - DictEntryList* LocateEntries(const Code& code); + ShortDictEntryList* LocateEntries(const Code& code); void SortHomophones(); }; diff --git a/test/table_test.cc b/test/table_test.cc index 3724dd25a9..d78e02d654 100644 --- a/test/table_test.cc +++ b/test/table_test.cc @@ -44,7 +44,7 @@ rime::the<rime::Table> RimeTableTest::table_; void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll, rime::Vocabulary& voc) { - auto d = rime::New<rime::DictEntry>(); + auto d = rime::New<rime::ShortDictEntry>(); syll.insert("0"); // no entries for '0', however syll.insert("1"); @@ -52,26 +52,26 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll, d->text = "yi"; d->weight = 1.0; voc[1].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); syll.insert("2"); d->code.back() = 2; d->text = "er"; voc[2].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); d->text = "liang"; voc[2].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); d->text = "lia"; voc[2].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); syll.insert("3"); d->code.back() = 3; d->text = "san"; voc[3].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); d->text = "sa"; voc[3].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); syll.insert("4"); auto lv2 = rime::New<rime::Vocabulary>(); voc[1].next_level = lv2; @@ -84,11 +84,11 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll, d->code.push_back(3); d->text = "yi-er-san"; (*lv3)[3].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); d->code.push_back(4); 
d->text = "yi-er-san-si"; (*lv4)[-1].entries.push_back(d); - d = rime::New<rime::DictEntry>(*d); + d = rime::New<rime::ShortDictEntry>(*d); d->code.resize(3); d->code.push_back(2); d->code.push_back(1); From c61668702a11f55bccebc5a755474ca20794c9f1 Mon Sep 17 00:00:00 2001 From: WhiredPlanck Date: Sun, 11 Jun 2023 18:22:28 +0800 Subject: [PATCH 5/5] refactor(dict_compiler.cc): correct the usage of auto in loop --- src/rime/dict/dict_compiler.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/rime/dict/dict_compiler.cc b/src/rime/dict/dict_compiler.cc index df50c1439b..c448f0499f 100644 --- a/src/rime/dict/dict_compiler.cc +++ b/src/rime/dict/dict_compiler.cc @@ -236,7 +236,7 @@ bool DictCompiler::BuildTable(int table_index, for (const auto& s : collector.syllabary) { syllable_to_id[s] = syllable_id++; } - for (auto r : collector.entries) { + for (const auto& r : collector.entries) { Code code; for (const auto& s : r->raw_code) { code.push_back(syllable_to_id[s]);