Skip to content

Commit 210ab6c

Browse files
committed
feat(dict): shorten the data struct used by Vocabulary
This may reduce peak memory consumption by about 18% when compiling a dictionary.
1 parent 2644f3c commit 210ab6c

File tree

6 files changed

+73
-26
lines changed

6 files changed

+73
-26
lines changed

src/rime/dict/dict_compiler.cc

+2-2
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,12 @@ bool DictCompiler::BuildTable(int table_index,
241241
for (const auto& s : r->raw_code) {
242242
code.push_back(syllable_to_id[s]);
243243
}
244-
DictEntryList* ls = vocabulary.LocateEntries(code);
244+
auto ls = vocabulary.LocateEntries(code);
245245
if (!ls) {
246246
LOG(ERROR) << "Error locating entries in vocabulary.";
247247
continue;
248248
}
249-
auto e = New<DictEntry>();
249+
auto e = New<ShortDictEntry>();
250250
e->code.swap(code);
251251
e->text.swap(r->text);
252252
e->weight = log(r->weight > 0 ? r->weight : DBL_EPSILON);

src/rime/dict/table.cc

+3-3
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ table::TailIndex* Table::BuildTailIndex(const Code& prefix,
518518
return index;
519519
}
520520

521-
Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
521+
Array<table::Entry>* Table::BuildEntryArray(const ShortDictEntryList& entries) {
522522
auto array = CreateArray<table::Entry>(entries.size());
523523
if (!array) {
524524
return NULL;
@@ -531,7 +531,7 @@ Array<table::Entry>* Table::BuildEntryArray(const DictEntryList& entries) {
531531
return array;
532532
}
533533

534-
bool Table::BuildEntryList(const DictEntryList& src,
534+
bool Table::BuildEntryList(const ShortDictEntryList& src,
535535
List<table::Entry>* dest) {
536536
if (!dest)
537537
return false;
@@ -549,7 +549,7 @@ bool Table::BuildEntryList(const DictEntryList& src,
549549
return true;
550550
}
551551

552-
bool Table::BuildEntry(const DictEntry& dict_entry, table::Entry* entry) {
552+
bool Table::BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry) {
553553
if (!entry)
554554
return false;
555555
if (!AddString(dict_entry.text, &entry->text, dict_entry.weight)) {

src/rime/dict/table.h

+3-3
Original file line numberDiff line numberDiff line change
@@ -166,9 +166,9 @@ class Table : public MappedFile {
166166
const Vocabulary& vocabulary);
167167
bool BuildPhraseIndex(Code code, const Vocabulary& vocabulary,
168168
map<string, int>* index_data);
169-
Array<table::Entry>* BuildEntryArray(const DictEntryList& entries);
170-
bool BuildEntryList(const DictEntryList& src, List<table::Entry>* dest);
171-
bool BuildEntry(const DictEntry& dict_entry, table::Entry* entry);
169+
Array<table::Entry>* BuildEntryArray(const ShortDictEntryList& entries);
170+
bool BuildEntryList(const ShortDictEntryList& src, List<table::Entry>* dest);
171+
bool BuildEntry(const ShortDictEntry& dict_entry, table::Entry* entry);
172172

173173
string GetString(const table::StringType& x);
174174
bool AddString(const string& src, table::StringType* dest,

src/rime/dict/vocabulary.cc

+38-7
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
// 2011-07-24 GONG Chen <[email protected]>
66
//
77
#include <algorithm>
8+
#include <iterator>
89
#include <sstream>
910
#include <utility>
1011
#include <rime/dict/vocabulary.h>
@@ -59,6 +60,18 @@ string Code::ToString() const {
5960
return stream.str();
6061
}
6162

63+
// Returns a compact copy of this entry that carries only its text, code and
// weight; all other DictEntry fields are left behind.
//
// Intentionally NOT marked `inline`: the member is declared in vocabulary.h
// without `inline` and is defined only in this .cc file. An inline function
// must be defined in every translation unit that uses it, so keeping the
// specifier here could leave callers in other files with an unresolved symbol.
ShortDictEntry DictEntry::ToShort() const {
  // Positional initialisation: the order must match the member order of
  // ShortDictEntry (text, code, weight).
  return {text, code, weight};
}
66+
67+
// Orders entries that share a code by weight, heaviest first.
// Entries of equal weight compare as equivalent (neither is "less").
bool ShortDictEntry::operator< (const ShortDictEntry& other) const {
  // The tie-breaking comparison on text is deliberately left disabled to
  // avoid the extra string comparisons while sorting:
  //   text < other.text
  if (weight == other.weight)
    return false;
  // Descending order: the larger weight sorts first.
  return weight > other.weight;
}
74+
6275
bool DictEntry::operator< (const DictEntry& other) const {
6376
// Sort different entries sharing the same code by weight desc.
6477
if (weight != other.weight)
@@ -72,16 +85,34 @@ inline bool dereference_less(const T& a, const T& b) {
7285
return *a < *b;
7386
}
7487

88+
template <typename C>
89+
inline void sort(C &container) {
90+
std::sort(std::begin(container), std::end(container), dereference_less<typename C::value_type>);
91+
}
92+
93+
template <typename C>
94+
inline void sort_range(C &container, size_t start, size_t count) {
95+
if (start >= container.size())
96+
return;
97+
auto i(std::begin(container) + start);
98+
auto j(start + count >= container.size() ? std::end(container) : i + count);
99+
std::sort(i, j, dereference_less<typename C::value_type>);
100+
}
101+
102+
void ShortDictEntryList::Sort() {
103+
sort(*this);
104+
}
105+
106+
void ShortDictEntryList::SortRange(size_t start, size_t count) {
107+
sort_range(*this, start, count);
108+
}
109+
75110
void DictEntryList::Sort() {
76-
std::sort(begin(), end(), dereference_less<DictEntryList::value_type>);
111+
sort(*this);
77112
}
78113

79114
void DictEntryList::SortRange(size_t start, size_t count) {
80-
if (start >= size())
81-
return;
82-
iterator i(begin() + start);
83-
iterator j(start + count >= size() ? end() : i + count);
84-
std::sort(i, j, dereference_less<DictEntryList::value_type>);
115+
sort_range(*this, start, count);
85116
}
86117

87118
void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
@@ -96,7 +127,7 @@ void DictEntryFilterBinder::AddFilter(DictEntryFilter filter) {
96127
}
97128
}
98129

99-
DictEntryList* Vocabulary::LocateEntries(const Code& code) {
130+
ShortDictEntryList* Vocabulary::LocateEntries(const Code& code) {
100131
Vocabulary* v = this;
101132
size_t n = code.size();
102133
for (size_t i = 0; i < n; ++i) {

src/rime/dict/vocabulary.h

+18-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,15 @@ class Code : public vector<SyllableId> {
3030
string ToString() const;
3131
};
3232

33+
struct ShortDictEntry {
34+
string text;
35+
Code code; // multi-syllable code from prism
36+
double weight = 0.0;
37+
38+
ShortDictEntry() = default;
39+
bool operator< (const ShortDictEntry& other) const;
40+
};
41+
3342
struct DictEntry {
3443
string text;
3544
string comment;
@@ -41,9 +50,16 @@ struct DictEntry {
4150
int remaining_code_length = 0;
4251

4352
DictEntry() = default;
53+
ShortDictEntry ToShort() const;
4454
bool operator< (const DictEntry& other) const;
4555
};
4656

57+
// A vector of of<ShortDictEntry> handles with in-place sorting helpers.
// Sorting compares the entries the handles refer to, not the handles.
class ShortDictEntryList : public vector<of<ShortDictEntry>> {
 public:
  // Sorts the entire list.
  void Sort();

  // Sorts up to `count` entries starting at index `start`.
  void SortRange(size_t start, size_t count);
};
62+
4763
class DictEntryList : public vector<of<DictEntry>> {
4864
public:
4965
void Sort();
@@ -64,13 +80,13 @@ class DictEntryFilterBinder {
6480
class Vocabulary;
6581

6682
struct VocabularyPage {
67-
DictEntryList entries;
83+
ShortDictEntryList entries;
6884
an<Vocabulary> next_level;
6985
};
7086

7187
class Vocabulary : public map<int, VocabularyPage> {
7288
public:
73-
DictEntryList* LocateEntries(const Code& code);
89+
ShortDictEntryList* LocateEntries(const Code& code);
7490
void SortHomophones();
7591
};
7692

test/table_test.cc

+9-9
Original file line numberDiff line numberDiff line change
@@ -44,34 +44,34 @@ rime::the<rime::Table> RimeTableTest::table_;
4444

4545
void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
4646
rime::Vocabulary& voc) {
47-
auto d = rime::New<rime::DictEntry>();
47+
auto d = rime::New<rime::ShortDictEntry>();
4848
syll.insert("0");
4949
// no entries for '0', however
5050
syll.insert("1");
5151
d->code.push_back(1);
5252
d->text = "yi";
5353
d->weight = 1.0;
5454
voc[1].entries.push_back(d);
55-
d = rime::New<rime::DictEntry>(*d);
55+
d = rime::New<rime::ShortDictEntry>(*d);
5656
syll.insert("2");
5757
d->code.back() = 2;
5858
d->text = "er";
5959
voc[2].entries.push_back(d);
60-
d = rime::New<rime::DictEntry>(*d);
60+
d = rime::New<rime::ShortDictEntry>(*d);
6161
d->text = "liang";
6262
voc[2].entries.push_back(d);
63-
d = rime::New<rime::DictEntry>(*d);
63+
d = rime::New<rime::ShortDictEntry>(*d);
6464
d->text = "lia";
6565
voc[2].entries.push_back(d);
66-
d = rime::New<rime::DictEntry>(*d);
66+
d = rime::New<rime::ShortDictEntry>(*d);
6767
syll.insert("3");
6868
d->code.back() = 3;
6969
d->text = "san";
7070
voc[3].entries.push_back(d);
71-
d = rime::New<rime::DictEntry>(*d);
71+
d = rime::New<rime::ShortDictEntry>(*d);
7272
d->text = "sa";
7373
voc[3].entries.push_back(d);
74-
d = rime::New<rime::DictEntry>(*d);
74+
d = rime::New<rime::ShortDictEntry>(*d);
7575
syll.insert("4");
7676
auto lv2 = rime::New<rime::Vocabulary>();
7777
voc[1].next_level = lv2;
@@ -84,11 +84,11 @@ void RimeTableTest::PrepareSampleVocabulary(rime::Syllabary& syll,
8484
d->code.push_back(3);
8585
d->text = "yi-er-san";
8686
(*lv3)[3].entries.push_back(d);
87-
d = rime::New<rime::DictEntry>(*d);
87+
d = rime::New<rime::ShortDictEntry>(*d);
8888
d->code.push_back(4);
8989
d->text = "yi-er-san-si";
9090
(*lv4)[-1].entries.push_back(d);
91-
d = rime::New<rime::DictEntry>(*d);
91+
d = rime::New<rime::ShortDictEntry>(*d);
9292
d->code.resize(3);
9393
d->code.push_back(2);
9494
d->code.push_back(1);

0 commit comments

Comments
 (0)