Skip to content

Commit aba381d

Browse files
author
Andy C
committed
[mycpp/runtime] Fix hash table pileup perf bug for mops::BigInt
I only recently added this hash function (the one for mops::BigInt). This commit also fixes the one for int (32-bit).
1 parent 0825a6d commit aba381d

File tree

4 files changed

+118
-42
lines changed

4 files changed

+118
-42
lines changed

demo/sparse-array.sh

Lines changed: 79 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,41 @@ set -o nounset
2626
set -o pipefail
2727
set -o errexit
2828

29+
compare-x() {
30+
local x=$1
31+
32+
local osh=_bin/cxx-opt/osh
33+
ninja $osh
34+
35+
echo ===
36+
echo $osh SparseArray
37+
echo
38+
time sparse-$x $osh
39+
40+
for sh in bash $osh; do
41+
echo ===
42+
echo $sh
43+
echo
44+
time $sh $0 $x
45+
done
46+
}
47+
48+
compare-sum-shift() {
49+
compare-x sum-shift
50+
}
51+
52+
compare-append-sparse() {
53+
compare-x append-sparse
54+
}
55+
56+
compare-append-dense() {
57+
compare-x append-dense
58+
}
59+
60+
#
61+
# Workloads
62+
#
63+
2964
sum-shift() {
3065
local n=${1:-1000}
3166

@@ -112,26 +147,6 @@ f
112147
EOF
113148
}
114149

115-
compare-sum-shift() {
116-
# more like 1M iterations - 1.8 seconds in bash
117-
# So that's 1.8 ms for 1000 iterations
118-
119-
local osh=_bin/cxx-opt/osh
120-
ninja $osh
121-
122-
echo ===
123-
echo $osh SparseArray
124-
echo
125-
time sparse-sum-shift $osh
126-
127-
for sh in bash $osh; do
128-
echo ===
129-
echo $sh
130-
echo
131-
time $sh $0 sum-shift
132-
done
133-
}
134-
135150
append-sparse() {
136151
local n=${1:-24} # up to 2^n
137152
local m=${2:-2000}
@@ -182,21 +197,52 @@ f
182197
EOF
183198
}
184199

185-
compare-append-sparse() {
186-
local osh=_bin/cxx-opt/osh
187-
ninja $osh
200+
append-dense() {
201+
local n=${1:-24} # up to 2^n
202+
local m=${2:-2000}
188203

189-
echo ===
190-
echo $osh SparseArray
191-
echo
192-
time sparse-append-sparse $osh
204+
to_append=( $(seq $m) ) # split words
193205

194-
for sh in bash $osh; do
195-
echo ===
196-
echo $sh
197-
echo
198-
time $sh $0 append-sparse
199-
done
206+
a=()
207+
for (( i = 0; i < n; ++i )) {
208+
a+=( ${to_append[@]} )
209+
}
210+
#echo ${a[@]}
211+
#echo ${!a[@]}
212+
echo ${#a[@]}
213+
}
214+
215+
sparse-append-dense() {
216+
local osh=${1:-_bin/cxx-opt/osh}
217+
local n=${2:-24}
218+
local m=${3:-2000}
219+
220+
NUM_ITERS=$n TO_APPEND=$m $osh <<'EOF'
221+
n=$NUM_ITERS
222+
m=$TO_APPEND
223+
to_append=( $(seq $m) ) # split words before ysh:upgrade
224+
225+
shopt --set ysh:upgrade
226+
227+
f() {
228+
a=()
229+
var sp = _a2sp(a)
230+
231+
for (( i = 0; i < n; ++i )) {
232+
#call _opsp(sp, 'set', 1 << i, str(i))
233+
call _opsp(sp, 'append', to_append)
234+
235+
#time call _opsp(sp, 'append', to_append)
236+
#echo $[_opsp(sp, 'len')]
237+
#echo
238+
}
239+
echo $[_opsp(sp, 'len')]
240+
#echo @[_opsp(sp, 'subst')]
241+
#echo @[_opsp(sp, 'keys')]
242+
}
243+
244+
f
245+
EOF
200246
}
201247

202248
demo() {

mycpp/gc_dict.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ void Dict<K, V>::reserve(int num_desired) {
233233
values_ = NewSlab<V>(capacity_);
234234

235235
if (old_k != nullptr) { // rehash if there were any entries
236+
//log("REHASH num_desired %d", num_desired);
236237
len_ = 0;
237238
for (int i = 0; i < old_len; ++i) {
238239
set(old_k->items_[i], old_v->items_[i]);

mycpp/gc_dict_test.cc

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include "mycpp/gc_dict.h"
22

3+
#include <unordered_map>
4+
35
#include "mycpp/gc_mylib.h"
46
#include "vendor/greatest.h"
57

@@ -686,6 +688,31 @@ TEST test_hash() {
686688
PASS();
687689
}
688690

691+
TEST hash_pileup_bug() {
692+
auto* d = Alloc<Dict<mops::BigInt, BigStr*>>();
693+
694+
std::unordered_map<unsigned, bool> hist;
695+
696+
for (int i = 0; i < 24; ++i) {
697+
mops::BigInt index {1 << i};
698+
log("index %ld", index);
699+
700+
for (mops::BigInt j = index; j < index + 2000; ++j) {
701+
d->set(j, kEmptyString);
702+
unsigned h = hash_key(j);
703+
hist[h] = true;
704+
// log("%ld %d", j, h);
705+
}
706+
log("len %d", len(d));
707+
}
708+
709+
log("len %d", len(d));
710+
log("unique hashes %d", hist.size());
711+
712+
PASS();
713+
}
714+
715+
689716
GREATEST_MAIN_DEFS();
690717

691718
int main(int argc, char** argv) {
@@ -710,6 +737,7 @@ int main(int argc, char** argv) {
710737
RUN_TEST(dict_iters_test);
711738

712739
RUN_TEST(test_hash);
740+
RUN_TEST(hash_pileup_bug);
713741

714742
gHeap.CleanProcessExit();
715743

mycpp/hash.cc

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,19 @@ unsigned hash_key(BigStr* s) {
2121
}
2222

2323
unsigned hash_key(int n) {
24-
return n;
24+
return fnv1(reinterpret_cast<const char*>(&n), sizeof(n));
2525
}
2626

2727
unsigned hash_key(mops::BigInt n) {
28-
// Cast to a smaller type. Is there anything better we can do?
29-
return static_cast<unsigned>(n);
28+
// Bug fix: our dict sizing is a power of 2, and we don't want integers in
29+
// the workload to interact badly with it.
30+
return fnv1(reinterpret_cast<const char*>(&n), sizeof(n));
31+
}
32+
33+
unsigned hash_key(void* p) {
34+
// e.g. for Dict<Token*, int>, hash the pointer itself, which means we use
35+
// object IDENTITY, not value.
36+
return fnv1(reinterpret_cast<const char*>(&p), sizeof(void*));
3037
}
3138

3239
unsigned hash_key(Tuple2<int, int>* t1) {
@@ -36,9 +43,3 @@ unsigned hash_key(Tuple2<int, int>* t1) {
3643
unsigned hash_key(Tuple2<BigStr*, int>* t1) {
3744
return t1->at0()->hash(fnv1) + t1->at1();
3845
}
39-
40-
// e.g. for Dict<Token*, int>, hash the pointer itself, which means we use
41-
// object IDENTITY, not value.
42-
unsigned hash_key(void* p) {
43-
return fnv1(reinterpret_cast<const char*>(&p), sizeof(void*));
44-
}

0 commit comments

Comments
 (0)