Skip to content

Commit

Permalink
Reduce Memory Consumption When Compiling Dictionaries (2) (#669)
Browse files Browse the repository at this point in the history
* refactor(dict_compiler.cc): release the raw codes of the raw dict entries in time

This may reduce memory consumption by about 25% compared with before (1c43fe5).

* refactor(entry_collector.cc): minor refactor of how RawDictEntry is pushed back
  • Loading branch information
WhiredPlanck authored Jul 1, 2023
1 parent ab586ca commit 3e0487c
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 17 deletions.
4 changes: 3 additions & 1 deletion src/rime/dict/dict_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,8 @@ bool DictCompiler::BuildTable(int table_index,
for (const auto& s : r->raw_code) {
code.push_back(syllable_to_id[s]);
}
// release memory in time to reduce memory usage
RawCode().swap(r->raw_code);
auto ls = vocabulary.LocateEntries(code);
if (!ls) {
LOG(ERROR) << "Error locating entries in vocabulary.";
Expand All @@ -236,7 +238,7 @@ bool DictCompiler::BuildTable(int table_index,
e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON);
ls->push_back(e);
}
// release memory in time to reduce peak memory usage
// release memory in time to reduce memory usage
vector<of<RawDictEntry>>().swap(collector.entries);
if (settings->sort_order() != "original") {
vocabulary.SortHomophones();
Expand Down
33 changes: 17 additions & 16 deletions src/rime/dict/entry_collector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,13 @@ void EntryCollector::Finish() {
void EntryCollector::CreateEntry(const string& word,
const string& code_str,
const string& weight_str) {
RawDictEntry e;
e.raw_code.FromString(code_str);
e.text = word;
e.weight = 0.0;
an<RawDictEntry> e = New<RawDictEntry>();
e->raw_code.FromString(code_str);
e->text = word;
e->weight = 0.0;
bool scaled = boost::ends_with(weight_str, "%");
if ((weight_str.empty() || scaled) && preset_vocabulary) {
preset_vocabulary->GetWeightForEntry(e.text, &e.weight);
preset_vocabulary->GetWeightForEntry(e->text, &e->weight);
}
if (scaled) {
double percentage = 100.0;
Expand All @@ -167,39 +167,40 @@ void EntryCollector::CreateEntry(const string& word,
LOG(WARNING) << "invalid entry definition at #" << num_entries << ".";
percentage = 100.0;
}
e.weight *= percentage / 100.0;
e->weight *= percentage / 100.0;
} else if (!weight_str.empty()) { // absolute weight
try {
e.weight = std::stod(weight_str);
e->weight = std::stod(weight_str);
} catch (...) {
LOG(WARNING) << "invalid entry definition at #" << num_entries << ".";
e.weight = 0.0;
e->weight = 0.0;
}
}
// learn new syllables, or check if syllables are in the fixed syllabary.
for (const string& s : e.raw_code) {
for (const string& s : e->raw_code) {
if (syllabary.find(s) == syllabary.end()) {
if (build_syllabary) {
syllabary.insert(s);
} else {
LOG(ERROR) << "dropping entry '" << e.text
LOG(ERROR) << "dropping entry '" << e->text
<< "' with invalid syllable: " << s;
return;
}
}
}
// learn new word
bool is_word = (e.raw_code.size() == 1);
bool is_word = (e->raw_code.size() == 1);
if (is_word) {
if (words[e.text].find(code_str) != words[e.text].end()) {
LOG(WARNING) << "duplicate word definition '" << e.text << "': ["
auto& weights = words[e->text];
if (weights.find(code_str) != weights.end()) {
LOG(WARNING) << "duplicate word definition '" << e->text << "': ["
<< code_str << "].";
return;
}
words[e.text][code_str] += e.weight;
total_weight[e.text] += e.weight;
weights[code_str] += e->weight;
total_weight[e->text] += e->weight;
}
entries.emplace_back(New<RawDictEntry>(e));
entries.emplace_back(std::move(e));
++num_entries;
}

Expand Down

0 comments on commit 3e0487c

Please sign in to comment.